From 79d89769ddb518c998b932f4b93b949535939e5d Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Fri, 3 May 2024 21:15:01 +1200
Subject: [PATCH v2 2/2] Optimize memory management and increase performance of
 Materialize

Here we make tuplestore.c use a generation.c memory context rather than
allocating tuples into the ExecutorState memory context.  Doing that
could cause that context to become bloated when pfree'd chunks are not
reused by future tuples.  This speeds up users of tuplestore.c such as
the Materialize, WindowAgg and CTE Scan executor nodes.  Some benchmarks
have shown up to a 22% performance increase in a query containing a
Materialize node, but much higher gains are possible if the memory
reduction prevents spilling to disk.  This is especially true for
WindowAgg nodes where improvements of several thousand times are
possible.

A generation.c memory context is much better suited for this job as it
works well with FIFO palloc/pfree patterns and consumes less space as
there is no rounding allocations up to the next power-of-2 value as
there is with the aset.c context used for ExecutorState.

This also allows us to more efficiently cleanup memory used by the
tuplestore as we can reset the generation context rather than looping
over all stored tuples and pfree'ing them.

Also, remove a badly placed USEMEM call in readtup_heap().  The tuple
wasn't being allocated in the Tuplestorestate's context, so no need to
adjust the memory consumed by the tuplestore there.

Discussion: https://postgr.es/m/CAApHDvp5Py9g4Rjq7_inL3-MCK1Co2CRt_YWFwTU2zfQix0p4A%40mail.gmail.com
---
 src/backend/utils/sort/tuplestore.c | 52 ++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index 24bb49ca87..551f0f0a19 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -266,7 +266,11 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
 	state->availMem = state->allowedMem;
 	state->maxSpace = 0;
 	state->myfile = NULL;
-	state->context = CurrentMemoryContext;
+
+	/* palloc/pfree uses FIFO pattern, which is ideal for generation.c */
+	state->context = GenerationContextCreate(CurrentMemoryContext,
+											 "tuplestore tuples",
+											 ALLOCSET_DEFAULT_SIZES);
 	state->resowner = CurrentResourceOwner;
 
 	state->memtupdeleted = 0;
@@ -429,14 +433,38 @@ tuplestore_clear(Tuplestorestate *state)
 	if (state->myfile)
 		BufFileClose(state->myfile);
 	state->myfile = NULL;
-	if (state->memtuples)
+
+#ifdef USE_ASSERT_CHECKING
 	{
+		int64		availMem = state->availMem;
+
+		/*
+		 * Below, we reset the memory context for storing tuples.  To save
+		 * from having to always call GetMemoryChunkSpace() on all stored
+		 * tuples, we adjust the availMem to forget all the tuples and just
+		 * recall USEMEM for the space used by the memtuples array.  Here we
+		 * just Assert that's correct and the memory tracking hasn't gone
+		 * wrong anywhere.
+		 */
 		for (i = state->memtupdeleted; i < state->memtupcount; i++)
-		{
-			FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
-			pfree(state->memtuples[i]);
-		}
+			availMem += GetMemoryChunkSpace(state->memtuples[i]);
+
+		availMem += GetMemoryChunkSpace(state->memtuples);
+
+		Assert(availMem == state->allowedMem);
 	}
+#endif
+
+	/* clear the memory consumed by the memory tuples */
+	MemoryContextReset(state->context);
+
+	/*
+	 * Zero the used memory and re-consume the space for the memtuples array.
+	 * This saves having to FREEMEM for each stored tuple.
+	 */
+	state->availMem = state->allowedMem;
+	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+
 	state->status = TSS_INMEM;
 	state->truncated = false;
 	state->memtupdeleted = 0;
@@ -458,16 +486,11 @@ tuplestore_clear(Tuplestorestate *state)
 void
 tuplestore_end(Tuplestorestate *state)
 {
-	int			i;
-
 	if (state->myfile)
 		BufFileClose(state->myfile);
-	if (state->memtuples)
-	{
-		for (i = state->memtupdeleted; i < state->memtupcount; i++)
-			pfree(state->memtuples[i]);
-		pfree(state->memtuples);
-	}
+
+	MemoryContextDelete(state->context);
+	pfree(state->memtuples);
 	pfree(state->readptrs);
 	pfree(state);
 }
@@ -1578,7 +1601,6 @@ readtup_heap(Tuplestorestate *state, unsigned int len)
 	MinimalTuple tuple = (MinimalTuple) palloc(tuplen);
 	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
 
-	USEMEM(state, GetMemoryChunkSpace(tuple));
 	/* read in the tuple proper */
 	tuple->t_len = tuplen;
 	BufFileReadExact(state->myfile, tupbody, tupbodylen);
-- 
2.34.1

