From 2fb151b850f87a3190e49ecb77e95cbc71ca9f07 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-46-230.ec2.internal>
Date: Thu, 18 Dec 2025 18:59:31 +0000
Subject: [PATCH v2 1/1] pg_stat_statements: enforce single query normalization
 per JumbleState

A JumbleState instance can be used by more than one extension, and all
will use the same query string and constant locations produced by core
query jumbling. However, generate_normalized_query in
pg_stat_statements currently modifies the constant lengths stored in
JumbleState, which could be overwritten by another extension implementing
its own normalization logic. While this has not been reported in practice,
it is an undesirable interaction and should be fixed.

To better enforce this invariant, expose a normalization callback in
JumbleState initialized to generate_normalized_query as part of moving all
related normalization code from pg_stat_statements into core query jumbling.
Normalization is invoked through this callback, ensuring that all hooks
on the call chain accessing the same JumbleState use the same normalization
function.

This refactor renames the functions to match the naming conventions used
by the core query jumbling code in queryjumblefuncs.c, and updates code
comments that no longer apply or that benefit from clarification.

Discussion: https://postgr.es/m/aQA9v9nLu5qsX8IE%40paquier.xyz
---
 .../pg_stat_statements/pg_stat_statements.c   | 277 +-----------------
 src/backend/nodes/queryjumblefuncs.c          | 251 ++++++++++++++++
 src/include/nodes/queryjumble.h               |   9 +
 3 files changed, 271 insertions(+), 266 deletions(-)

diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 39208f80b5b..5c11939b656 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -50,7 +50,6 @@
 #include "access/htup_details.h"
 #include "access/parallel.h"
 #include "catalog/pg_authid.h"
-#include "common/int.h"
 #include "executor/instrument.h"
 #include "funcapi.h"
 #include "jit/jit.h"
@@ -59,7 +58,6 @@
 #include "nodes/queryjumble.h"
 #include "optimizer/planner.h"
 #include "parser/analyze.h"
-#include "parser/scanner.h"
 #include "pgstat.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
@@ -377,11 +375,6 @@ static char *qtext_fetch(Size query_offset, int query_len,
 static bool need_gc_qtexts(void);
 static void gc_qtexts(void);
 static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
-static char *generate_normalized_query(JumbleState *jstate, const char *query,
-									   int query_loc, int *query_len_p);
-static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
-									 int query_loc);
-static int	comp_location(const void *a, const void *b);
 
 
 /*
@@ -1359,9 +1352,17 @@ pgss_store(const char *query, int64 queryId,
 		if (jstate)
 		{
 			LWLockRelease(pgss->lock);
-			norm_query = generate_normalized_query(jstate, query,
-												   query_location,
-												   &query_len);
+
+			/*
+			 * generate the normalized query. Note that the normalized
+			 * representation may well vary depending on just which
+			 * "equivalent" query is used to create the hashtable entry. We
+			 * assume this is OK.
+			 */
+			Assert(jstate->normalize_query != NULL);
+			norm_query = jstate->normalize_query(jstate, query,
+												 query_location,
+												 &query_len);
 			LWLockAcquire(pgss->lock, LW_SHARED);
 		}
 
@@ -2823,259 +2824,3 @@ release_lock:
 
 	return stats_reset;
 }
-
-/*
- * Generate a normalized version of the query string that will be used to
- * represent all similar queries.
- *
- * Note that the normalized representation may well vary depending on
- * just which "equivalent" query is used to create the hashtable entry.
- * We assume this is OK.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate.  (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * *query_len_p contains the input string length, and is updated with
- * the result string length on exit.  The resulting string might be longer
- * or shorter depending on what happens with replacement of constants.
- *
- * Returns a palloc'd string.
- */
-static char *
-generate_normalized_query(JumbleState *jstate, const char *query,
-						  int query_loc, int *query_len_p)
-{
-	char	   *norm_query;
-	int			query_len = *query_len_p;
-	int			norm_query_buflen,	/* Space allowed for norm_query */
-				len_to_wrt,		/* Length (in bytes) to write */
-				quer_loc = 0,	/* Source query byte location */
-				n_quer_loc = 0, /* Normalized query byte location */
-				last_off = 0,	/* Offset from start for previous tok */
-				last_tok_len = 0;	/* Length (in bytes) of that tok */
-	int			num_constants_replaced = 0;
-
-	/*
-	 * Get constants' lengths (core system only gives us locations).  Note
-	 * this also ensures the items are sorted by location.
-	 */
-	fill_in_constant_lengths(jstate, query, query_loc);
-
-	/*
-	 * Allow for $n symbols to be longer than the constants they replace.
-	 * Constants must take at least one byte in text form, while a $n symbol
-	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
-	 * could refine that limit based on the max value of n for the current
-	 * query, but it hardly seems worth any extra effort to do so.
-	 */
-	norm_query_buflen = query_len + jstate->clocations_count * 10;
-
-	/* Allocate result buffer */
-	norm_query = palloc(norm_query_buflen + 1);
-
-	for (int i = 0; i < jstate->clocations_count; i++)
-	{
-		int			off,		/* Offset from start for cur tok */
-					tok_len;	/* Length (in bytes) of that tok */
-
-		/*
-		 * If we have an external param at this location, but no lists are
-		 * being squashed across the query, then we skip here; this will make
-		 * us print the characters found in the original query that represent
-		 * the parameter in the next iteration (or after the loop is done),
-		 * which is a bit odd but seems to work okay in most cases.
-		 */
-		if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
-			continue;
-
-		off = jstate->clocations[i].location;
-
-		/* Adjust recorded location if we're dealing with partial string */
-		off -= query_loc;
-
-		tok_len = jstate->clocations[i].length;
-
-		if (tok_len < 0)
-			continue;			/* ignore any duplicates */
-
-		/* Copy next chunk (what precedes the next constant) */
-		len_to_wrt = off - last_off;
-		len_to_wrt -= last_tok_len;
-		Assert(len_to_wrt >= 0);
-		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
-		n_quer_loc += len_to_wrt;
-
-		/*
-		 * And insert a param symbol in place of the constant token; and, if
-		 * we have a squashable list, insert a placeholder comment starting
-		 * from the list's second value.
-		 */
-		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
-							  num_constants_replaced + 1 + jstate->highest_extern_param_id,
-							  jstate->clocations[i].squashed ? " /*, ... */" : "");
-		num_constants_replaced++;
-
-		/* move forward */
-		quer_loc = off + tok_len;
-		last_off = off;
-		last_tok_len = tok_len;
-	}
-
-	/*
-	 * We've copied up until the last ignorable constant.  Copy over the
-	 * remaining bytes of the original query string.
-	 */
-	len_to_wrt = query_len - quer_loc;
-
-	Assert(len_to_wrt >= 0);
-	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
-	n_quer_loc += len_to_wrt;
-
-	Assert(n_quer_loc <= norm_query_buflen);
-	norm_query[n_quer_loc] = '\0';
-
-	*query_len_p = n_quer_loc;
-	return norm_query;
-}
-
-/*
- * Given a valid SQL string and an array of constant-location records,
- * fill in the textual lengths of those constants.
- *
- * The constants may use any allowed constant syntax, such as float literals,
- * bit-strings, single-quoted strings and dollar-quoted strings.  This is
- * accomplished by using the public API for the core scanner.
- *
- * It is the caller's job to ensure that the string is a valid SQL statement
- * with constants at the indicated locations.  Since in practice the string
- * has already been parsed, and the locations that the caller provides will
- * have originated from within the authoritative parser, this should not be
- * a problem.
- *
- * Multiple constants can have the same location.  We reset lengths of those
- * past the first to -1 so that they can later be ignored.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate.  (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * N.B. There is an assumption that a '-' character at a Const location begins
- * a negative numeric constant.  This precludes there ever being another
- * reason for a constant to start with a '-'.
- */
-static void
-fill_in_constant_lengths(JumbleState *jstate, const char *query,
-						 int query_loc)
-{
-	LocationLen *locs;
-	core_yyscan_t yyscanner;
-	core_yy_extra_type yyextra;
-	core_YYSTYPE yylval;
-	YYLTYPE		yylloc;
-
-	/*
-	 * Sort the records by location so that we can process them in order while
-	 * scanning the query text.
-	 */
-	if (jstate->clocations_count > 1)
-		qsort(jstate->clocations, jstate->clocations_count,
-			  sizeof(LocationLen), comp_location);
-	locs = jstate->clocations;
-
-	/* initialize the flex scanner --- should match raw_parser() */
-	yyscanner = scanner_init(query,
-							 &yyextra,
-							 &ScanKeywords,
-							 ScanKeywordTokens);
-
-	/* we don't want to re-emit any escape string warnings */
-	yyextra.escape_string_warning = false;
-
-	/* Search for each constant, in sequence */
-	for (int i = 0; i < jstate->clocations_count; i++)
-	{
-		int			loc;
-		int			tok;
-
-		/* Ignore constants after the first one in the same location */
-		if (i > 0 && locs[i].location == locs[i - 1].location)
-		{
-			locs[i].length = -1;
-			continue;
-		}
-
-		if (locs[i].squashed)
-			continue;			/* squashable list, ignore */
-
-		/* Adjust recorded location if we're dealing with partial string */
-		loc = locs[i].location - query_loc;
-		Assert(loc >= 0);
-
-		/*
-		 * We have a valid location for a constant that's not a dupe. Lex
-		 * tokens until we find the desired constant.
-		 */
-		for (;;)
-		{
-			tok = core_yylex(&yylval, &yylloc, yyscanner);
-
-			/* We should not hit end-of-string, but if we do, behave sanely */
-			if (tok == 0)
-				break;			/* out of inner for-loop */
-
-			/*
-			 * We should find the token position exactly, but if we somehow
-			 * run past it, work with that.
-			 */
-			if (yylloc >= loc)
-			{
-				if (query[loc] == '-')
-				{
-					/*
-					 * It's a negative value - this is the one and only case
-					 * where we replace more than a single token.
-					 *
-					 * Do not compensate for the core system's special-case
-					 * adjustment of location to that of the leading '-'
-					 * operator in the event of a negative constant.  It is
-					 * also useful for our purposes to start from the minus
-					 * symbol.  In this way, queries like "select * from foo
-					 * where bar = 1" and "select * from foo where bar = -2"
-					 * will have identical normalized query strings.
-					 */
-					tok = core_yylex(&yylval, &yylloc, yyscanner);
-					if (tok == 0)
-						break;	/* out of inner for-loop */
-				}
-
-				/*
-				 * We now rely on the assumption that flex has placed a zero
-				 * byte after the text of the current token in scanbuf.
-				 */
-				locs[i].length = strlen(yyextra.scanbuf + loc);
-				break;			/* out of inner for-loop */
-			}
-		}
-
-		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
-		if (tok == 0)
-			break;
-	}
-
-	scanner_finish(yyscanner);
-}
-
-/*
- * comp_location: comparator for qsorting LocationLen structs by location
- */
-static int
-comp_location(const void *a, const void *b)
-{
-	int			l = ((const LocationLen *) a)->location;
-	int			r = ((const LocationLen *) b)->location;
-
-	return pg_cmp_s32(l, r);
-}
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index ffc230af427..9dd43a7456f 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -40,10 +40,12 @@
 #include "access/transam.h"
 #include "catalog/pg_proc.h"
 #include "common/hashfn.h"
+#include "common/int.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/queryjumble.h"
 #include "utils/lsyscache.h"
+#include "parser/scanner.h"
 #include "parser/scansup.h"
 
 #define JUMBLE_SIZE				1024	/* query serialization buffer size */
@@ -68,6 +70,8 @@ static void FlushPendingNulls(JumbleState *jstate);
 static void RecordConstLocation(JumbleState *jstate,
 								bool extern_param,
 								int location, int len);
+static char *GenerateNormalizedQuery(JumbleState *jstate, const char *query,
+									 int query_loc, int *query_len_p);
 static void _jumbleNode(JumbleState *jstate, Node *node);
 static void _jumbleList(JumbleState *jstate, Node *node);
 static void _jumbleElements(JumbleState *jstate, List *elements, Node *node);
@@ -196,6 +200,7 @@ InitJumble(void)
 #ifdef USE_ASSERT_CHECKING
 	jstate->total_jumble_len = 0;
 #endif
+	jstate->normalize_query = GenerateNormalizedQuery;
 
 	return jstate;
 }
@@ -773,3 +778,249 @@ _jumbleRangeTblEntry_eref(JumbleState *jstate,
 	 */
 	JUMBLE_STRING(aliasname);
 }
+
+/*
+ * CompLocation: comparator for qsorting LocationLen structs by location
+ */
+static int
+CompLocation(const void *a, const void *b)
+{
+	int			l = ((const LocationLen *) a)->location;
+	int			r = ((const LocationLen *) b)->location;
+
+	return pg_cmp_s32(l, r);
+}
+
+/*
+ * Given a valid SQL string and an array of constant-location records,
+ * fill in the textual lengths of those constants in JumbleState.
+ *
+ * The constants may use any allowed constant syntax, such as float literals,
+ * bit-strings, single-quoted strings and dollar-quoted strings.  This is
+ * accomplished by using the public API for the core scanner.
+ *
+ * Multiple constants can have the same location.  We reset lengths of those
+ * past the first to -1 so that they can later be ignored.
+ *
+ * N.B. There is an assumption that a '-' character at a Const location begins
+ * a negative numeric constant.  This precludes there ever being another
+ * reason for a constant to start with a '-'.
+ */
+static void
+SetConstantLengths(JumbleState *jstate, const char *query,
+				   int query_loc)
+{
+	LocationLen *locs;
+	core_yyscan_t yyscanner;
+	core_yy_extra_type yyextra;
+	core_YYSTYPE yylval;
+	YYLTYPE		yylloc;
+
+	/*
+	 * Sort the records by location so that we can process them in order while
+	 * scanning the query text.
+	 */
+	if (jstate->clocations_count > 1)
+		qsort(jstate->clocations, jstate->clocations_count,
+			  sizeof(LocationLen), CompLocation);
+	locs = jstate->clocations;
+
+	/* initialize the flex scanner --- should match raw_parser() */
+	yyscanner = scanner_init(query,
+							 &yyextra,
+							 &ScanKeywords,
+							 ScanKeywordTokens);
+
+	/* we don't want to re-emit any escape string warnings */
+	yyextra.escape_string_warning = false;
+
+	/* Search for each constant, in sequence */
+	for (int i = 0; i < jstate->clocations_count; i++)
+	{
+		int			loc;
+		int			tok;
+
+		/* Ignore constants after the first one in the same location */
+		if (i > 0 && locs[i].location == locs[i - 1].location)
+		{
+			locs[i].length = -1;
+			continue;
+		}
+
+		if (locs[i].squashed)
+			continue;			/* squashable list, ignore */
+
+		/*
+		 * Adjust the constant's location using the provided starting location
+		 * of the current statement. This allows us to avoid scanning a
+		 * multi-statement string from the beginning.
+		 */
+		loc = locs[i].location - query_loc;
+		Assert(loc >= 0);
+
+		/*
+		 * We have a valid location for a constant that's not a dupe. Lex
+		 * tokens until we find the desired constant.
+		 */
+		for (;;)
+		{
+			tok = core_yylex(&yylval, &yylloc, yyscanner);
+
+			/* We should not hit end-of-string, but if we do, behave sanely */
+			if (tok == 0)
+				break;			/* out of inner for-loop */
+
+			/*
+			 * We should find the token position exactly, but if we somehow
+			 * run past it, work with that.
+			 */
+			if (yylloc >= loc)
+			{
+				if (query[loc] == '-')
+				{
+					/*
+					 * It's a negative value - this is the one and only case
+					 * where we replace more than a single token.
+					 *
+					 * Do not compensate for the special-case adjustment of
+					 * location to that of the leading '-' operator in the
+					 * event of a negative constant (see doNegate() in
+					 * gram.y).  It is also useful for our purposes to start
+					 * from the minus symbol.  In this way, queries like
+					 * "select * from foo where bar = 1" and "select * from
+					 * foo where bar = -2" will have identical normalized
+					 * query strings.
+					 */
+					tok = core_yylex(&yylval, &yylloc, yyscanner);
+					if (tok == 0)
+						break;	/* out of inner for-loop */
+				}
+
+				/*
+				 * We now rely on the assumption that flex has placed a zero
+				 * byte after the text of the current token in scanbuf.
+				 */
+				locs[i].length = strlen(yyextra.scanbuf + loc);
+				break;			/* out of inner for-loop */
+			}
+		}
+
+		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
+		if (tok == 0)
+			break;
+	}
+
+	scanner_finish(yyscanner);
+}
+
+/*
+ * Callback to generate a normalized version of the query string that will be used to
+ * represent all similar queries.
+ *
+ * It is the caller's job to ensure that the string is a valid SQL statement
+ * with the correct constant locations in jstate->clocations. Since in
+ * practice the string has already been parsed using the authoritative parser
+ * and the locations are set by core query jumbling, this should not be a problem.
+ *
+ * *query_len_p contains the input string length, and is updated with
+ * the result string length on exit.  The resulting string might be longer
+ * or shorter depending on what happens with replacement of constants.
+ *
+ * Returns a palloc'd string.
+ */
+static char *
+GenerateNormalizedQuery(JumbleState *jstate, const char *query,
+						int query_loc, int *query_len_p)
+{
+	char	   *norm_query;
+	int			query_len = *query_len_p;
+	int			norm_query_buflen,	/* Space allowed for norm_query */
+				len_to_wrt,		/* Length (in bytes) to write */
+				quer_loc = 0,	/* Source query byte location */
+				n_quer_loc = 0, /* Normalized query byte location */
+				last_off = 0,	/* Offset from start for previous tok */
+				last_tok_len = 0;	/* Length (in bytes) of that tok */
+	int			num_constants_replaced = 0;
+
+	/*
+	 * Set constants' lengths in JumbleState, as only locations are set during
+	 * DoJumble(). Note this also ensures the items are sorted by location.
+	 */
+	SetConstantLengths((JumbleState *) jstate, query, query_loc);
+
+	/*
+	 * Allow for $n symbols to be longer than the constants they replace.
+	 * Constants must take at least one byte in text form, while a $n symbol
+	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
+	 * could refine that limit based on the max value of n for the current
+	 * query, but it hardly seems worth any extra effort to do so.
+	 */
+	norm_query_buflen = query_len + jstate->clocations_count * 10;
+
+	/* Allocate result buffer */
+	norm_query = palloc(norm_query_buflen + 1);
+
+	for (int i = 0; i < jstate->clocations_count; i++)
+	{
+		int			off,		/* Offset from start for cur tok */
+					tok_len;	/* Length (in bytes) of that tok */
+
+		/*
+		 * If we have an external param at this location, but no lists are
+		 * being squashed across the query, then we skip here; this will make
+		 * us print the characters found in the original query that represent
+		 * the parameter in the next iteration (or after the loop is done),
+		 * which is a bit odd but seems to work okay in most cases.
+		 */
+		if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
+			continue;
+
+		off = jstate->clocations[i].location;
+
+		/* Adjust the constant's location (see SetConstantLengths) */
+		off -= query_loc;
+
+		tok_len = jstate->clocations[i].length;
+
+		if (tok_len < 0)
+			continue;			/* ignore any duplicates */
+
+		/* Copy next chunk (what precedes the next constant) */
+		len_to_wrt = off - last_off;
+		len_to_wrt -= last_tok_len;
+		Assert(len_to_wrt >= 0);
+		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+		n_quer_loc += len_to_wrt;
+
+		/*
+		 * And insert a param symbol in place of the constant token; and, if
+		 * we have a squashable list, insert a placeholder comment starting
+		 * from the list's second value.
+		 */
+		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
+							  num_constants_replaced + 1 + jstate->highest_extern_param_id,
+							  jstate->clocations[i].squashed ? " /*, ... */" : "");
+		num_constants_replaced++;
+
+		/* move forward */
+		quer_loc = off + tok_len;
+		last_off = off;
+		last_tok_len = tok_len;
+	}
+
+	/*
+	 * We've copied up until the last ignorable constant.  Copy over the
+	 * remaining bytes of the original query string.
+	 */
+	len_to_wrt = query_len - quer_loc;
+
+	Assert(len_to_wrt >= 0);
+	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+	n_quer_loc += len_to_wrt;
+
+	Assert(n_quer_loc <= norm_query_buflen);
+	norm_query[n_quer_loc] = '\0';
+
+	*query_len_p = n_quer_loc;
+	return norm_query;
+}
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index dcb36dcb44f..a756d9b2446 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -75,6 +75,15 @@ typedef struct JumbleState
 	/* The total number of bytes added to the jumble buffer */
 	Size		total_jumble_len;
 #endif
+
+	/*
+	 * Internal callback for query normalization. The callback may modify
+	 * internal state (e.g., filling in constant lengths). This is set to
+	 * GenerateNormalizedQuery during InitJumble(), but third-party plugins
+	 * that want to provide their own normalization may modify it.
+	 */
+	char	   *(*normalize_query) (struct JumbleState *jstate,
+									const char *query, int query_loc, int *query_len_p);
 } JumbleState;
 
 /* Values for the compute_query_id GUC */
-- 
2.43.0

