From 3a129c7765a6066ec164d504bcdc2942ace00959 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Wed, 3 Jan 2024 23:41:44 +0100
Subject: [PATCH v1 7/7] NodeSupport: Apply RLE and differential encoding on
 lists and bitmaps

This reduces raw serialized size of the initdb pg_rewrite.ev_action
dataset by ~1.6%, it's pglz-compressed dataset by 5%/15kB (!), and the
size of the template0 database by another 2 blocks, for a total
improvement of 400kB/49 blocks.
---
 src/backend/nodes/outfuncs.c  | 89 ++++++++++++++++++++++++++++++++---
 src/backend/nodes/read.c      | 38 +++++++++++++--
 src/backend/nodes/readfuncs.c | 15 +++++-
 3 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 83f7e43242..d44e1db3d5 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -337,18 +337,49 @@ static void
 _outList(StringInfo str, const List *node)
 {
 	const ListCell *lc;
+	union LCSurrogate {
+		int32 i;
+		Oid o;
+		TransactionId x;
+	}				previous = { .i = 0};
+	int				run = 0;
+	int				run_delta;
+	const char	   *fmt;
 
 	appendStringInfoChar(str, '(');
 
 	if (IsA(node, IntList))
+	{
 		appendStringInfoChar(str, 'i');
+		fmt = " %d";
+		run_delta = 1;
+	}
 	else if (IsA(node, OidList))
+	{
 		appendStringInfoChar(str, 'o');
+		fmt = " %u";
+		run_delta = 0;
+	}
 	else if (IsA(node, XidList))
+	{
 		appendStringInfoChar(str, 'x');
+		fmt = " %u";
+		run_delta = 1;
+	}
+	else if (IsA(node, List))
+	{
+		/* silence the compiler about uninitialized variables */
+		fmt = "";
+		run_delta = 0;
+	}
+	else
+		elog(ERROR, "unrecognized list node type: %d",
+			 (int) node->type);
 
 	foreach(lc, node)
 	{
+		union LCSurrogate val = {.i = 0};
+
 		/*
 		 * For the sake of backward compatibility, we emit a slightly
 		 * different whitespace format for lists of nodes vs. other types of
@@ -359,18 +390,30 @@ _outList(StringInfo str, const List *node)
 			outNode(str, lfirst(lc));
 			if (lnext(node, lc))
 				appendStringInfoChar(str, ' ');
+			continue;
 		}
 		else if (IsA(node, IntList))
-			appendStringInfo(str, " %d", lfirst_int(lc));
+			val.i = lfirst_int(lc);
 		else if (IsA(node, OidList))
-			appendStringInfo(str, " %u", lfirst_oid(lc));
+			val.o = lfirst_oid(lc);
 		else if (IsA(node, XidList))
-			appendStringInfo(str, " %u", lfirst_xid(lc));
+			val.x = lfirst_xid(lc);
+
+		if (val.i == previous.i + run_delta)
+			run += 1;
 		else
-			elog(ERROR, "unrecognized list node type: %d",
-				 (int) node->type);
+		{
+			if (run > 0)
+				appendStringInfo(str, " +%d", run);
+			run = 0;
+			appendStringInfo(str, fmt, val.i);
+		}
+		previous = val;
 	}
 
+	if (run > 0)
+		appendStringInfo(str, " +%d", run);
+
 	appendStringInfoChar(str, ')');
 }
 
@@ -387,12 +430,46 @@ void
 outBitmapset(StringInfo str, const Bitmapset *bms)
 {
 	int			x;
+	int			prev = 0;
+	int			run = 0;
 
 	appendStringInfoChar(str, '(');
 	appendStringInfoChar(str, 'b');
 	x = -1;
 	while ((x = bms_next_member(bms, x)) >= 0)
-		appendStringInfo(str, " %d", x);
+	{
+		int increment = x - prev;
+		if (increment == 1)
+		{
+			run++;
+		}
+		else if (run != 0)
+		{
+			/*
+			 * If the run is only one value, the '-' sign of run notation is
+			 * larger than just the plain differential value, so only
+			 * write out runs longer than 1.
+			 */
+			if (run == 1)
+				appendStringInfo(str, " %d", 1);
+			else
+				appendStringInfo(str, " +%d", run);
+
+			appendStringInfo(str, " %d", increment);
+			run = 0;
+		}
+		else
+		{
+			appendStringInfo(str, " %d", increment);
+		}
+		prev = x;
+	}
+
+	if (run == 1)
+		appendStringInfo(str, " %d", run);
+	else if (run > 1)
+		appendStringInfo(str, " +%d", run);
+
 	appendStringInfoChar(str, ')');
 }
 
diff --git a/src/backend/nodes/read.c b/src/backend/nodes/read.c
index 2815b268be..362672c24e 100644
--- a/src/backend/nodes/read.c
+++ b/src/backend/nodes/read.c
@@ -402,6 +402,8 @@ nodeRead(const char *token, int tok_len)
 					elog(ERROR, "unterminated List structure");
 				if (tok_len == 1 && token[0] == 'i')
 				{
+					int		prev = 0;
+
 					/* List of integers */
 					for (;;)
 					{
@@ -417,12 +419,23 @@ nodeRead(const char *token, int tok_len)
 						if (endptr != token + tok_len)
 							elog(ERROR, "unrecognized integer: \"%.*s\"",
 								 tok_len, token);
-						l = lappend_int(l, val);
+
+						if (val > 0 && token[0] == '+')
+						{
+							for (int i = 0; i < val; i++)
+								l = lappend_int(l, ++prev);
+						}
+						else
+						{
+							prev = val;
+							l = lappend_int(l, val);
+						}
 					}
 					result = (Node *) l;
 				}
 				else if (tok_len == 1 && token[0] == 'o')
 				{
+					Oid		prev = 0;
 					/* List of OIDs */
 					for (;;)
 					{
@@ -438,12 +451,22 @@ nodeRead(const char *token, int tok_len)
 						if (endptr != token + tok_len)
 							elog(ERROR, "unrecognized OID: \"%.*s\"",
 								 tok_len, token);
-						l = lappend_oid(l, val);
+						if (token[0] == '+')
+						{
+							for (int i = 0; i < val; i++)
+								l = lappend_oid(l, prev);
+						}
+						else
+						{
+							prev = val;
+							l = lappend_oid(l, val);
+						}
 					}
 					result = (Node *) l;
 				}
 				else if (tok_len == 1 && token[0] == 'x')
 				{
+					TransactionId	prev = 0;
 					/* List of TransactionIds */
 					for (;;)
 					{
@@ -459,7 +482,16 @@ nodeRead(const char *token, int tok_len)
 						if (endptr != token + tok_len)
 							elog(ERROR, "unrecognized Xid: \"%.*s\"",
 								 tok_len, token);
-						l = lappend_xid(l, val);
+						if (token[0] == '+')
+						{
+							for (int i = 0; i < val; i++)
+								l = lappend_xid(l, ++prev);
+						}
+						else
+						{
+							prev = val;
+							l = lappend_xid(l, val);
+						}
 					}
 					result = (Node *) l;
 				}
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 19c023fb32..ad96ff6646 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -298,6 +298,7 @@ static Bitmapset *
 _readBitmapset(void)
 {
 	Bitmapset  *result = NULL;
+	int			prev = 0;
 
 	READ_TEMP_LOCALS();
 
@@ -326,7 +327,19 @@ _readBitmapset(void)
 		val = (int) strtol(token, &endptr, 10);
 		if (endptr != token + length)
 			elog(ERROR, "unrecognized integer: \"%.*s\"", length, token);
-		result = bms_add_member(result, val);
+		if (val > 0 && token[0] == '+')
+		{
+			for (int i = 0; i < val; i++)
+			{
+				prev++;
+				result = bms_add_member(result, prev);
+			}
+		}
+		else
+		{
+			prev += val;
+			result = bms_add_member(result, prev);
+		}
 	}
 
 	return result;
-- 
2.40.1

