From 956c0a456a149f0e687d118b8c6ddad8e42df828 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-46-230.ec2.internal>
Date: Fri, 24 Oct 2025 20:42:30 +0000
Subject: [PATCH v4 2/2] pg_stat_statements: move out jumble-specific routines

During discussion of commit 8eddb06, it was noted that the
fill_in_constant_lengths and generate_normalized_query routines
depend on intricate jumbling knowledge. Implementing this code
within pg_stat_statements does not make sense; it belongs in the
core jumbling module, queryjumblefuncs.c. pg_stat_statements can
then consume these routines without needing to know the details.

This separation became especially apparent with the squashing of
constant lists, but it has been a general issue all along.

In addition to reducing the pg_stat_statements codebase, this
change makes these routines available to other extensions
performing similar operations.

Discussion: https://www.postgresql.org/message-id/CAA5RZ0t6rynOzvABTbHZPF9ydKoay5LN%2B_iM3hHk0ni_Qu_9tg%40mail.gmail.com
---
 .../pg_stat_statements/pg_stat_statements.c   | 274 +-----------------
 src/backend/nodes/queryjumblefuncs.c          | 258 +++++++++++++++++
 src/include/nodes/queryjumble.h               |   4 +
 3 files changed, 269 insertions(+), 267 deletions(-)

diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 1d22dc07da9..95293d3b594 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -50,7 +50,6 @@
 #include "access/htup_details.h"
 #include "access/parallel.h"
 #include "catalog/pg_authid.h"
-#include "common/int.h"
 #include "executor/instrument.h"
 #include "funcapi.h"
 #include "jit/jit.h"
@@ -59,7 +58,6 @@
 #include "nodes/queryjumble.h"
 #include "optimizer/planner.h"
 #include "parser/analyze.h"
-#include "parser/scanner.h"
 #include "pgstat.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
@@ -377,12 +375,6 @@ static char *qtext_fetch(Size query_offset, int query_len,
 static bool need_gc_qtexts(void);
 static void gc_qtexts(void);
 static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
-static char *generate_normalized_query(JumbleState *jstate, const char *query,
-									   int query_loc, int *query_len_p);
-static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
-									 int query_loc);
-static int	comp_location(const void *a, const void *b);
-
 
 /*
  * Module load callback
@@ -1359,6 +1351,13 @@ pgss_store(const char *query, int64 queryId,
 		if (jstate)
 		{
 			LWLockRelease(pgss->lock);
+
+			/*
+			 * generate the normalized query. Note that the normalized
+			 * representation may well vary depending on just which
+			 * "equivalent" query is used to create the hashtable entry. We
+			 * assume this is OK.
+			 */
 			norm_query = generate_normalized_query(jstate, query,
 												   query_location,
 												   &query_len);
@@ -2823,262 +2822,3 @@ release_lock:
 
 	return stats_reset;
 }
-
-/*
- * Generate a normalized version of the query string that will be used to
- * represent all similar queries.
- *
- * Note that the normalized representation may well vary depending on
- * just which "equivalent" query is used to create the hashtable entry.
- * We assume this is OK.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate.  (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * *query_len_p contains the input string length, and is updated with
- * the result string length on exit.  The resulting string might be longer
- * or shorter depending on what happens with replacement of constants.
- *
- * Returns a palloc'd string.
- */
-static char *
-generate_normalized_query(JumbleState *jstate, const char *query,
-						  int query_loc, int *query_len_p)
-{
-	char	   *norm_query;
-	int			query_len = *query_len_p;
-	int			norm_query_buflen,	/* Space allowed for norm_query */
-				len_to_wrt,		/* Length (in bytes) to write */
-				quer_loc = 0,	/* Source query byte location */
-				n_quer_loc = 0, /* Normalized query byte location */
-				last_off = 0,	/* Offset from start for previous tok */
-				last_tok_len = 0;	/* Length (in bytes) of that tok */
-	int			num_constants_replaced = 0;
-
-	/*
-	 * Get constants' lengths (core system only gives us locations).  Note
-	 * this also ensures the items are sorted by location.
-	 */
-	fill_in_constant_lengths(jstate, query, query_loc);
-
-	/*
-	 * Allow for $n symbols to be longer than the constants they replace.
-	 * Constants must take at least one byte in text form, while a $n symbol
-	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
-	 * could refine that limit based on the max value of n for the current
-	 * query, but it hardly seems worth any extra effort to do so.
-	 */
-	norm_query_buflen = query_len + jstate->clocations_count * 10;
-
-	/* Allocate result buffer */
-	norm_query = palloc(norm_query_buflen + 1);
-
-	for (int i = 0; i < jstate->clocations_count; i++)
-	{
-		int			off,		/* Offset from start for cur tok */
-					tok_len;	/* Length (in bytes) of that tok */
-
-		/*
-		 * If we have an external param at this location, but no lists are
-		 * being squashed across the query, then we skip here; this will make
-		 * us print the characters found in the original query that represent
-		 * the parameter in the next iteration (or after the loop is done),
-		 * which is a bit odd but seems to work okay in most cases.
-		 */
-		if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
-			continue;
-
-		off = jstate->clocations[i].location;
-
-		/* Adjust recorded location if we're dealing with partial string */
-		off -= query_loc;
-
-		tok_len = jstate->clocations[i].length;
-
-		if (tok_len < 0)
-			continue;			/* ignore any duplicates */
-
-		/* Copy next chunk (what precedes the next constant) */
-		len_to_wrt = off - last_off;
-		len_to_wrt -= last_tok_len;
-		Assert(len_to_wrt >= 0);
-		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
-		n_quer_loc += len_to_wrt;
-
-		/*
-		 * And insert a param symbol in place of the constant token; and, if
-		 * we have a squashable list, insert a placeholder comment starting
-		 * from the list's second value.
-		 */
-		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
-							  num_constants_replaced + 1 + jstate->highest_extern_param_id,
-							  jstate->clocations[i].squashed ? " /*, ... */" : "");
-		num_constants_replaced++;
-
-		/* move forward */
-		quer_loc = off + tok_len;
-		last_off = off;
-		last_tok_len = tok_len;
-	}
-
-	/*
-	 * We've copied up until the last ignorable constant.  Copy over the
-	 * remaining bytes of the original query string.
-	 */
-	len_to_wrt = query_len - quer_loc;
-
-	Assert(len_to_wrt >= 0);
-	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
-	n_quer_loc += len_to_wrt;
-
-	Assert(n_quer_loc <= norm_query_buflen);
-	norm_query[n_quer_loc] = '\0';
-
-	*query_len_p = n_quer_loc;
-	return norm_query;
-}
-
-/*
- * Given a valid SQL string and an array of constant-location records,
- * fill in the textual lengths of those constants.
- *
- * The constants may use any allowed constant syntax, such as float literals,
- * bit-strings, single-quoted strings and dollar-quoted strings.  This is
- * accomplished by using the public API for the core scanner.
- *
- * It is the caller's job to ensure that the string is a valid SQL statement
- * with constants at the indicated locations.  Since in practice the string
- * has already been parsed, and the locations that the caller provides will
- * have originated from within the authoritative parser, this should not be
- * a problem.
- *
- * Duplicate constant pointers are possible, and will have their lengths
- * marked as '-1', so that they are later ignored.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate.  (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * N.B. There is an assumption that a '-' character at a Const location begins
- * a negative numeric constant.  This precludes there ever being another
- * reason for a constant to start with a '-'.
- */
-static void
-fill_in_constant_lengths(JumbleState *jstate, const char *query,
-						 int query_loc)
-{
-	LocationLen *locs;
-	core_yyscan_t yyscanner;
-	core_yy_extra_type yyextra;
-	core_YYSTYPE yylval;
-	YYLTYPE		yylloc;
-	int			last_loc = -1;
-	int			i;
-
-	/*
-	 * Sort the records by location so that we can process them in order while
-	 * scanning the query text.
-	 */
-	if (jstate->clocations_count > 1)
-		qsort(jstate->clocations, jstate->clocations_count,
-			  sizeof(LocationLen), comp_location);
-	locs = jstate->clocations;
-
-	/* initialize the flex scanner --- should match raw_parser() */
-	yyscanner = scanner_init(query,
-							 &yyextra,
-							 &ScanKeywords,
-							 ScanKeywordTokens);
-
-	/* we don't want to re-emit any escape string warnings */
-	yyextra.escape_string_warning = false;
-
-	/* Search for each constant, in sequence */
-	for (i = 0; i < jstate->clocations_count; i++)
-	{
-		int			loc = locs[i].location;
-		int			tok;
-		bool		squashed = locs[i].squashed;
-
-		/* Adjust recorded location if we're dealing with partial string */
-		loc -= query_loc;
-
-		Assert(loc >= 0);
-
-		if (loc <= last_loc)
-		{
-			locs[i].length = -1;
-			continue;			/* Duplicate constant, ignore */
-		}
-
-		/* We have a valid location, so let's save it */
-		last_loc = loc;
-
-		if (squashed)
-			continue;			/* squashable list, ignore */
-
-		/* Lex tokens until we find the desired constant */
-		for (;;)
-		{
-			tok = core_yylex(&yylval, &yylloc, yyscanner);
-
-			/* We should not hit end-of-string, but if we do, behave sanely */
-			if (tok == 0)
-				break;			/* out of inner for-loop */
-
-			/*
-			 * We should find the token position exactly, but if we somehow
-			 * run past it, work with that.
-			 */
-			if (yylloc >= loc)
-			{
-				if (query[loc] == '-')
-				{
-					/*
-					 * It's a negative value - this is the one and only case
-					 * where we replace more than a single token.
-					 *
-					 * Do not compensate for the core system's special-case
-					 * adjustment of location to that of the leading '-'
-					 * operator in the event of a negative constant.  It is
-					 * also useful for our purposes to start from the minus
-					 * symbol.  In this way, queries like "select * from foo
-					 * where bar = 1" and "select * from foo where bar = -2"
-					 * will have identical normalized query strings.
-					 */
-					tok = core_yylex(&yylval, &yylloc, yyscanner);
-					if (tok == 0)
-						break;	/* out of inner for-loop */
-				}
-
-				/*
-				 * We now rely on the assumption that flex has placed a zero
-				 * byte after the text of the current token in scanbuf.
-				 */
-				locs[i].length = strlen(yyextra.scanbuf + loc);
-				break;			/* out of inner for-loop */
-			}
-		}
-
-		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
-		if (tok == 0)
-			break;
-	}
-
-	scanner_finish(yyscanner);
-}
-
-/*
- * comp_location: comparator for qsorting LocationLen structs by location
- */
-static int
-comp_location(const void *a, const void *b)
-{
-	int			l = ((const LocationLen *) a)->location;
-	int			r = ((const LocationLen *) b)->location;
-
-	return pg_cmp_s32(l, r);
-}
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 31f97151977..c27f1c12ac7 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -40,10 +40,12 @@
 #include "access/transam.h"
 #include "catalog/pg_proc.h"
 #include "common/hashfn.h"
+#include "common/int.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/queryjumble.h"
 #include "utils/lsyscache.h"
+#include "parser/scanner.h"
 #include "parser/scansup.h"
 
 #define JUMBLE_SIZE				1024	/* query serialization buffer size */
@@ -77,6 +79,262 @@ static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node);
 static void _jumbleRangeTblEntry_eref(JumbleState *jstate,
 									  RangeTblEntry *rte,
 									  Alias *expr);
+static int	comp_location(const void *a, const void *b);
+
+/*
+ * comp_location: comparator for qsorting LocationLen structs by location
+ */
+static int
+comp_location(const void *a, const void *b)
+{
+	int			l = ((const LocationLen *) a)->location;
+	int			r = ((const LocationLen *) b)->location;
+
+	return pg_cmp_s32(l, r);
+}
+
+ /*
+  * Generate a normalized version of the query string that will be used to
+  * represent all similar queries.
+  *
+  * If query_loc > 0, then "query" has been advanced by that much compared to
+  * the original string start, so we need to translate the provided locations
+  * to compensate.  (This lets us avoid re-scanning statements before the one
+  * of interest, so it's worth doing.)
+  *
+  * *query_len_p contains the input string length, and is updated with the
+  * result string length on exit.  The resulting string might be longer or
+  * shorter depending on what happens with replacement of constants.
+  *
+  * Returns a palloc'd string.
+  */
+char *
+generate_normalized_query(JumbleState *jstate, const char *query,
+						  int query_loc, int *query_len_p)
+{
+	char	   *norm_query;
+	int			query_len = *query_len_p;
+	int			norm_query_buflen,	/* Space allowed for norm_query */
+				len_to_wrt,		/* Length (in bytes) to write */
+				quer_loc = 0,	/* Source query byte location */
+				n_quer_loc = 0, /* Normalized query byte location */
+				last_off = 0,	/* Offset from start for previous tok */
+				last_tok_len = 0;	/* Length (in bytes) of that tok */
+	int			num_constants_replaced = 0;
+
+	/*
+	 * Get constants' lengths (core system only gives us locations).  Note
+	 * this also ensures the items are sorted by location.
+	 */
+	fill_in_constant_lengths(jstate, query, query_loc);
+
+	/*
+	 * Allow for $n symbols to be longer than the constants they replace.
+	 * Constants must take at least one byte in text form, while a $n symbol
+	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
+	 * could refine that limit based on the max value of n for the current
+	 * query, but it hardly seems worth any extra effort to do so.
+	 */
+	norm_query_buflen = query_len + jstate->clocations_count * 10;
+
+	/* Allocate result buffer */
+	norm_query = palloc(norm_query_buflen + 1);
+
+	for (int i = 0; i < jstate->clocations_count; i++)
+	{
+		int			off,		/* Offset from start for cur tok */
+					tok_len;	/* Length (in bytes) of that tok */
+
+		/*
+		 * If we have an external param at this location, but no lists are
+		 * being squashed across the query, then we skip here; this will make
+		 * us print the characters found in the original query that represent
+		 * the parameter in the next iteration (or after the loop is done),
+		 * which is a bit odd but seems to work okay in most cases.
+		 */
+		if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
+			continue;
+
+		off = jstate->clocations[i].location;
+
+		/* Adjust recorded location if we're dealing with partial string */
+		off -= query_loc;
+
+		tok_len = jstate->clocations[i].length;
+
+		if (tok_len < 0)
+			continue;			/* ignore any duplicates */
+
+		/* Copy next chunk (what precedes the next constant) */
+		len_to_wrt = off - last_off;
+		len_to_wrt -= last_tok_len;
+		Assert(len_to_wrt >= 0);
+		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+		n_quer_loc += len_to_wrt;
+
+		/*
+		 * And insert a param symbol in place of the constant token; and, if
+		 * we have a squashable list, insert a placeholder comment starting
+		 * from the list's second value.
+		 */
+		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
+							  num_constants_replaced + 1 + jstate->highest_extern_param_id,
+							  jstate->clocations[i].squashed ? " /*, ... */" : "");
+		num_constants_replaced++;
+
+		/* move forward */
+		quer_loc = off + tok_len;
+		last_off = off;
+		last_tok_len = tok_len;
+	}
+
+	/*
+	 * We've copied up until the last ignorable constant.  Copy over the
+	 * remaining bytes of the original query string.
+	 */
+	len_to_wrt = query_len - quer_loc;
+
+	Assert(len_to_wrt >= 0);
+	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+	n_quer_loc += len_to_wrt;
+
+	Assert(n_quer_loc <= norm_query_buflen);
+	norm_query[n_quer_loc] = '\0';
+
+	*query_len_p = n_quer_loc;
+	return norm_query;
+}
+
+/*
+ * Given a valid SQL string and an array of constant-location records,
+ * fill in the textual lengths of those constants.
+ *
+ * The constants may use any allowed constant syntax, such as float literals,
+ * bit-strings, single-quoted strings and dollar-quoted strings.  This is
+ * accomplished by using the public API for the core scanner.
+ *
+ * It is the caller's job to ensure that the string is a valid SQL statement
+ * with constants at the indicated locations.  Since in practice the string
+ * has already been parsed, and the locations that the caller provides will
+ * have originated from within the authoritative parser, this should not be
+ * a problem.
+ *
+ * Duplicate constant pointers are possible, and will have their lengths
+ * marked as '-1', so that they are later ignored.
+ *
+ * If query_loc > 0, then "query" has been advanced by that much compared to
+ * the original string start, so we need to translate the provided locations
+ * to compensate.  (This lets us avoid re-scanning statements before the one
+ * of interest, so it's worth doing.)
+ *
+ * N.B. There is an assumption that a '-' character at a Const location begins
+ * a negative numeric constant.  This precludes there ever being another
+ * reason for a constant to start with a '-'.
+ */
+void
+fill_in_constant_lengths(JumbleState *jstate, const char *query,
+						 int query_loc)
+{
+	LocationLen *locs;
+	core_yyscan_t yyscanner;
+	core_yy_extra_type yyextra;
+	core_YYSTYPE yylval;
+	YYLTYPE		yylloc;
+	int			last_loc = -1;
+	int			i;
+
+	/*
+	 * Sort the records by location so that we can process them in order while
+	 * scanning the query text.
+	 */
+	if (jstate->clocations_count > 1)
+		qsort(jstate->clocations, jstate->clocations_count,
+			  sizeof(LocationLen), comp_location);
+	locs = jstate->clocations;
+
+	/* initialize the flex scanner --- should match raw_parser() */
+	yyscanner = scanner_init(query,
+							 &yyextra,
+							 &ScanKeywords,
+							 ScanKeywordTokens);
+
+	/* we don't want to re-emit any escape string warnings */
+	yyextra.escape_string_warning = false;
+
+	/* Search for each constant, in sequence */
+	for (i = 0; i < jstate->clocations_count; i++)
+	{
+		int			loc = locs[i].location;
+		int			tok;
+		bool		squashed = locs[i].squashed;
+
+		/* Adjust recorded location if we're dealing with partial string */
+		loc -= query_loc;
+
+		Assert(loc >= 0);
+
+		if (loc <= last_loc)
+		{
+			locs[i].length = -1;
+			continue;			/* Duplicate constant, ignore */
+		}
+
+		/* We have a valid location, so let's save it */
+		last_loc = loc;
+
+		if (squashed)
+			continue;			/* squashable list, ignore */
+
+		/* Lex tokens until we find the desired constant */
+		for (;;)
+		{
+			tok = core_yylex(&yylval, &yylloc, yyscanner);
+
+			/* We should not hit end-of-string, but if we do, behave sanely */
+			if (tok == 0)
+				break;			/* out of inner for-loop */
+
+			/*
+			 * We should find the token position exactly, but if we somehow
+			 * run past it, work with that.
+			 */
+			if (yylloc >= loc)
+			{
+				if (query[loc] == '-')
+				{
+					/*
+					 * It's a negative value - this is the one and only case
+					 * where we replace more than a single token.
+					 *
+					 * Do not compensate for the core system's special-case
+					 * adjustment of location to that of the leading '-'
+					 * operator in the event of a negative constant.  It is
+					 * also useful for our purposes to start from the minus
+					 * symbol.  In this way, queries like "select * from foo
+					 * where bar = 1" and "select * from foo where bar = -2"
+					 * will have identical normalized query strings.
+					 */
+					tok = core_yylex(&yylval, &yylloc, yyscanner);
+					if (tok == 0)
+						break;	/* out of inner for-loop */
+				}
+
+				/*
+				 * We now rely on the assumption that flex has placed a zero
+				 * byte after the text of the current token in scanbuf.
+				 */
+				locs[i].length = strlen(yyextra.scanbuf + loc);
+				break;			/* out of inner for-loop */
+			}
+		}
+
+		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
+		if (tok == 0)
+			break;
+	}
+
+	scanner_finish(yyscanner);
+}
 
 /*
  * Given a possibly multi-statement source string, confine our attention to the
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index dcb36dcb44f..8024a0883f7 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -91,6 +91,10 @@ extern PGDLLIMPORT int compute_query_id;
 
 
 extern const char *CleanQuerytext(const char *query, int *location, int *len);
+extern char *generate_normalized_query(JumbleState *jstate, const char *query,
+									   int query_loc, int *query_len_p);
+extern void fill_in_constant_lengths(JumbleState *jstate, const char *query,
+									 int query_loc);
 extern JumbleState *JumbleQuery(Query *query);
 extern void EnableQueryId(void);
 
-- 
2.43.0

