On 23.01.2025 08:21, Andrey Borodin wrote:
<v11-0001-Allow-setting-sample-ratio-for-pg_stat_statements.patch>
There’s a typo in the commit message (ratio instead of rate). Besides this the 
patch looks ready for committer.


Best regards, Andrey Borodin.


Fixed. Thank you for the review!

I noticed that the code did not have enough comments, so I added more to improve clarity. I also moved the update_current_query_sampled() function to the end of the file, as keeping it between the hooks didn’t seem appropriate to me.

All these changes are included in the updated patch (v12), which I have attached.

The patch is part of the current 2025-01 Commitfest: https://commitfest.postgresql.org/51/5390/

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.
From ddbc6af2af511ff342b183cb13a9027edadc0ad3 Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdoki...@tantorlabs.ru>
Date: Thu, 23 Jan 2025 11:31:57 +0300
Subject: [PATCH v1] Allow setting sample rate for pg_stat_statements

New configuration parameter pg_stat_statements.sample_rate makes it
possible to track just a fraction of the queries, to reduce the amount
of tracking.
---
 .../pg_stat_statements/expected/select.out    | 76 +++++++++++++++++++
 .../pg_stat_statements/pg_stat_statements.c   | 73 +++++++++++++++---
 contrib/pg_stat_statements/sql/select.sql     | 21 +++++
 doc/src/sgml/pgstatstatements.sgml            | 18 +++++
 4 files changed, 179 insertions(+), 9 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out
index 37a30af034..558d93fb46 100644
--- a/contrib/pg_stat_statements/expected/select.out
+++ b/contrib/pg_stat_statements/expected/select.out
@@ -153,6 +153,82 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t
 (1 row)
 
+--
+-- sample statements
+--
+SET pg_stat_statements.sample_rate = 0.0;
+SELECT 1 AS "int";
+ int 
+-----
+   1
+(1 row)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                       query                        | calls 
+----------------------------------------------------+-------
+ SELECT pg_stat_statements_reset() IS NOT NULL AS t |     1
+(1 row)
+
+SET pg_stat_statements.sample_rate = 1.0;
+SELECT 1 AS "int";
+ int 
+-----
+   1
+(1 row)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT $1 AS "int"                                                     |     1
+ SELECT pg_stat_statements_reset() IS NOT NULL AS t                     |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(3 rows)
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
+set pg_stat_statements.sample_rate = 0.0;
+select pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 1 \parse stmt
+\bind_named stmt \g
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+ query | calls 
+-------+-------
+(0 rows)
+
+set pg_stat_statements.sample_rate = 1.0;
+\bind_named stmt \g
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT $1                                                              |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     0
+(2 rows)
+
+select pg_stat_statements_reset() IS NOT NULL AS t;
+ t 
+---
+ t
+(1 row)
+
 --
 -- queries with locking clauses
 --
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index bebf8134eb..92ba954d41 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -50,6 +50,7 @@
 #include "access/parallel.h"
 #include "catalog/pg_authid.h"
 #include "common/int.h"
+#include "common/pg_prng.h"
 #include "executor/instrument.h"
 #include "funcapi.h"
 #include "jit/jit.h"
@@ -294,6 +295,10 @@ static bool pgss_track_utility = true;	/* whether to track utility commands */
 static bool pgss_track_planning = false;	/* whether to track planning
 											 * duration */
 static bool pgss_save = true;	/* whether to save stats across shutdown */
+static double pgss_sample_rate = 1.0; /* fraction of statements to track */
+
+/* Is the current top-level query to be sampled? */
+static bool current_query_sampled = false;
 
 
 #define pgss_enabled(level) \
@@ -373,6 +378,7 @@ static char *generate_normalized_query(JumbleState *jstate, const char *query,
 static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
 									 int query_loc);
 static int	comp_location(const void *a, const void *b);
+static void	update_current_query_sampled(void);
 
 
 /*
@@ -414,6 +420,19 @@ _PG_init(void)
 							NULL,
 							NULL);
 
+	DefineCustomRealVariable("pg_stat_statements.sample_rate",
+							 "Fraction of queries to track.",
+							 NULL,
+							 &pgss_sample_rate,
+							 1.0,
+							 0.0,
+							 1.0,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
 	DefineCustomEnumVariable("pg_stat_statements.track",
 							 "Selects which statements are tracked by pg_stat_statements.",
 							 NULL,
@@ -889,12 +908,16 @@ pgss_planner(Query *parse,
 {
 	PlannedStmt *result;
 
+	/* Whether the current query is sampled based on sample_rate. */
+	update_current_query_sampled();
+
 	/*
-	 * We can't process the query if no query_string is provided, as
-	 * pgss_store needs it.  We also ignore query without queryid, as it would
-	 * be treated as a utility statement, which may not be the case.
+	 * We can't process the query if it is not sampled or
+	 * if no query_string is provided, as pgss_store needs it.
+	 * We also ignore query without queryid, as it would be treated
+	 * as a utility statement, which may not be the case.
 	 */
-	if (pgss_enabled(nesting_level)
+	if (pgss_enabled(nesting_level) && current_query_sampled
 		&& pgss_track_planning && query_string
 		&& parse->queryId != UINT64CONST(0))
 	{
@@ -994,12 +1017,16 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
 	else
 		standard_ExecutorStart(queryDesc, eflags);
 
+	/* Whether the current query is sampled based on sample_rate. */
+	update_current_query_sampled();
+
 	/*
-	 * If query has queryId zero, don't track it.  This prevents double
-	 * counting of optimizable statements that are directly contained in
-	 * utility statements.
+	 * If query has queryId zero or if query is not sampled, don't track it.
+	 * This prevents double counting of optimizable statements
+	 * that are directly contained in utility statements.
 	 */
-	if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
+	if (pgss_enabled(nesting_level) && current_query_sampled &&
+		queryDesc->plannedstmt->queryId != UINT64CONST(0))
 	{
 		/*
 		 * Set up to track total elapsed time in ExecutorRun.  Make sure the
@@ -1111,7 +1138,13 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 	uint64		saved_queryId = pstmt->queryId;
 	int			saved_stmt_location = pstmt->stmt_location;
 	int			saved_stmt_len = pstmt->stmt_len;
-	bool		enabled = pgss_track_utility && pgss_enabled(nesting_level);
+	bool		enabled;
+
+	/* Whether the current query is sampled based on sample_rate. */
+	update_current_query_sampled();
+
+	enabled = pgss_track_utility && pgss_enabled(nesting_level) &&
+				current_query_sampled;
 
 	/*
 	 * Force utility statements to get queryId zero.  We do this even in cases
@@ -3011,3 +3044,25 @@ comp_location(const void *a, const void *b)
 
 	return pg_cmp_s32(l, r);
 }
+
+/*
+ * At the beginning of each top-level statement, decide whether we'll
+ * sample this statement.  If nested-statement tracking is enabled,
+ * either all nested statements will be tracked or none will.
+ *
+ * When in a parallel worker, we should do nothing, which we can implement
+ * cheaply by pretending we decided not to sample the current statement.
+ */
+static void
+update_current_query_sampled(void)
+{
+	if (nesting_level == 0)
+	{
+		if (!IsParallelWorker())
+			current_query_sampled = pgss_sample_rate != 0.0 &&
+				(pgss_sample_rate == 1.0 ||
+				pg_prng_double(&pg_global_prng_state) <= pgss_sample_rate);
+		else
+			current_query_sampled = false;
+	}
+}
\ No newline at end of file
diff --git a/contrib/pg_stat_statements/sql/select.sql b/contrib/pg_stat_statements/sql/select.sql
index e0be58d5e2..21f09ef94a 100644
--- a/contrib/pg_stat_statements/sql/select.sql
+++ b/contrib/pg_stat_statements/sql/select.sql
@@ -59,6 +59,27 @@ DEALLOCATE pgss_test;
 SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 
+--
+-- sample statements
+--
+SET pg_stat_statements.sample_rate = 0.0;
+SELECT 1 AS "int";
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+SET pg_stat_statements.sample_rate = 1.0;
+SELECT 1 AS "int";
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+set pg_stat_statements.sample_rate = 0.0;
+select pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT 1 \parse stmt
+\bind_named stmt \g
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+set pg_stat_statements.sample_rate = 1.0;
+\bind_named stmt \g
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+select pg_stat_statements_reset() IS NOT NULL AS t;
+
 --
 -- queries with locking clauses
 --
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
index 501b468e9a..8ade4e3ced 100644
--- a/doc/src/sgml/pgstatstatements.sgml
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -936,6 +936,24 @@
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term>
+     <varname>pg_stat_statements.sample_rate</varname> (<type>real</type>)
+     <indexterm>
+      <primary><varname>pg_stat_statements.sample_rate</varname> configuration parameter</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      <varname>pg_stat_statements.sample_rate</varname> causes pg_stat_statements to
+      track only a fraction of the statements in each session. The default is 1,
+      meaning that all statements are tracked. In the case of nested statements,
+      either all will be tracked or none. Only superusers can change this setting.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term>
      <varname>pg_stat_statements.save</varname> (<type>boolean</type>)
-- 
2.34.1

Reply via email to