From 9dbf07f28d257bd6aef8bacb38076f0da9fc8a45 Mon Sep 17 00:00:00 2001
From: ChangAo Chen <cca5507@qq.com>
Date: Fri, 5 Dec 2025 23:48:05 +0800
Subject: [PATCH v2 2/2] Support loser tree for k-way merge.

The loser tree usually needs fewer comparisons than the heap during
k-way merge.  But the heap may still be better if the cardinality of
the input data is very low.  Add a GUC 'enable_loser_tree' to control
the use of the loser tree, defaulting to off.
---
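
Reviewer note (illustrative, not part of the commit): the tree is stored
implicitly, with internal nodes in losers[1..k-1] and the leaf for input j
conceptually at index j + k, so node i's children are 2*i and 2*i+1.  Below
is a minimal standalone C sketch of the same scheme, merging sorted integer
arrays -- it substitutes an INT_MAX sentinel for the patch's
srctape = LOSER_TREE_EOF marker, since plain integers have a natural
maximum:

  #include <stdio.h>
  #include <limits.h>

  #define K 4
  #define EOF_KEY INT_MAX       /* always loses, like LOSER_TREE_EOF */

  static int heads[K];          /* frontmost value of each input */
  static int losers[K];         /* losers[0] is the overall winner */

  /* Bottom-up build, mirroring tuplesort_loser_tree_build() */
  static void
  build(void)
  {
      int winners[K];

      for (int i = K - 1; i > 0; i--)
      {
          int l = 2 * i;
          int r = l + 1;
          int ll = (l >= K) ? l - K : winners[l];
          int rr = (r >= K) ? r - K : winners[r];

          if (heads[ll] < heads[rr])
          {
              winners[i] = ll;
              losers[i] = rr;
          }
          else
          {
              winners[i] = rr;
              losers[i] = ll;
          }
      }
      losers[0] = winners[1];
  }

  /* Replay matches from the winner's leaf parent up to the root */
  static void
  adjust(void)
  {
      int winner = losers[0];

      for (int i = (winner + K) / 2; i > 0; i /= 2)
      {
          if (heads[losers[i]] < heads[winner])
          {
              int tmp = winner;

              winner = losers[i];
              losers[i] = tmp;
          }
      }
      losers[0] = winner;
  }

  int
  main(void)
  {
      int inputs[K][3] = {{1, 5, 9}, {2, 6, 10}, {3, 7, 11}, {4, 8, 12}};
      int pos[K] = {0};

      for (int j = 0; j < K; j++)
          heads[j] = inputs[j][0];
      build();

      while (heads[losers[0]] != EOF_KEY)
      {
          int w = losers[0];

          printf("%d ", heads[w]);
          heads[w] = (++pos[w] < 3) ? inputs[w][pos[w]] : EOF_KEY;
          adjust();             /* find the next winner */
      }
      putchar('\n');            /* prints 1 2 ... 12 */
      return 0;
  }

Each replacement costs about ceil(log2(k)) comparisons in the loser tree
(one per level on the leaf-to-root path), versus up to roughly 2*log2(k)
for a binary heap's sift-down, which is where the claim above comes from.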
 doc/src/sgml/config.sgml                      |  14 ++
 src/backend/utils/misc/guc_parameters.dat     |   7 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/backend/utils/sort/tuplesort.c            | 218 ++++++++++++++----
 src/include/utils/guc.h                       |   1 +
 src/test/regress/expected/sysviews.out        |   3 +-
 6 files changed, 202 insertions(+), 42 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 405c9689bd0..031591bb820 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -5617,6 +5617,20 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-loser-tree" xreflabel="enable_loser_tree">
+      <term><varname>enable_loser_tree</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_loser_tree</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the external k-way merge's use of a loser tree.
+        A loser tree typically needs fewer comparisons per returned tuple
+        than the merge heap, but the heap may perform better when the
+        input data has very low cardinality.
+        The default is <literal>off</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-material" xreflabel="enable_material">
       <term><varname>enable_material</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 3b9d8349078..cbece911495 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -882,6 +882,13 @@
   boot_val => 'true',
 },
 
+{ name => 'enable_loser_tree', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+  short_desc => 'Enables the use of a loser tree for external k-way merge.',
+  flags => 'GUC_EXPLAIN',
+  variable => 'enable_loser_tree',
+  boot_val => 'false',
+},
+
 { name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
   short_desc => 'Enables the planner\'s use of materialization.',
   flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f8a..2ffba918a43 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -412,6 +412,7 @@
 #enable_incremental_sort = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_loser_tree = off
 #enable_material = on
 #enable_memoize = on
 #enable_mergejoin = on
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index e28b8cfa868..57c3d0266dc 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -42,16 +42,16 @@
  * end of the input is reached, we dump out remaining tuples in memory into
  * a final run, then merge the runs.
  *
- * When merging runs, we use a heap containing just the frontmost tuple from
- * each source run; we repeatedly output the smallest tuple and replace it
- * with the next tuple from its source tape (if any).  When the heap empties,
- * the merge is complete.  The basic merge algorithm thus needs very little
- * memory --- only M tuples for an M-way merge, and M is constrained to a
- * small number.  However, we can still make good use of our full workMem
- * allocation by pre-reading additional blocks from each source tape.  Without
- * prereading, our access pattern to the temporary file would be very erratic;
- * on average we'd read one block from each of M source tapes during the same
- * time that we're writing M blocks to the output tape, so there is no
+ * When merging runs, we use a heap or loser tree containing just the frontmost
+ * tuple from each source run; we repeatedly output the smallest tuple and
+ * replace it with the next tuple from its source tape (if any).  When the heap
+ * or loser tree empties, the merge is complete.  The basic merge algorithm thus
+ * needs very little memory --- only M tuples for an M-way merge, and M is
+ * constrained to a small number.  However, we can still make good use of our
+ * full workMem allocation by pre-reading additional blocks from each source tape.
+ * Without prereading, our access pattern to the temporary file would be very
+ * erratic; on average we'd read one block from each of M source tapes during
+ * the same time that we're writing M blocks to the output tape, so there is no
  * sequentiality of access at all, defeating the read-ahead methods used by
  * most Unix kernels.  Worse, the output tape gets written into a very random
  * sequence of blocks of the temp file, ensuring that things will be even
@@ -120,8 +120,12 @@
 #define INITIAL_MEMTUPSIZE Max(1024, \
 	ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
 
+/* LOSER_TREE_EOF marks an exhausted input run; it always loses comparisons */
+#define LOSER_TREE_EOF (-1)
+
 /* GUC variables */
 bool		trace_sort = false;
+bool		enable_loser_tree = false;
 
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
@@ -206,6 +210,8 @@ struct Tuplesortstate
 								 * space */
 	TupSortStatus maxSpaceStatus;	/* sort status when maxSpace was reached */
 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
+	bool		useLoserTree;	/* use a loser tree for k-way merge if true */
+	int16	   *losers;			/* array of losers; losers[0] is the winner */
 
 	/*
 	 * This array holds the tuples now in sort memory.  If we are in state
@@ -468,6 +474,8 @@ static void tuplesort_sort_memtuples(Tuplesortstate *state);
 static void tuplesort_heap_build(Tuplesortstate *state);
 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_delete_top(Tuplesortstate *state);
+static void tuplesort_loser_tree_build(Tuplesortstate *state);
+static void tuplesort_loser_tree_adjust(Tuplesortstate *state);
 static void reversedirection(Tuplesortstate *state);
 static unsigned int getlen(LogicalTape *tape, bool eofOK);
 static void markrunend(LogicalTape *tape);
@@ -682,6 +690,10 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
 	state->base.tuples = true;
 	state->abbrevNext = 10;
 
+	/* Use a loser tree for the k-way merge if enable_loser_tree is set */
+	state->useLoserTree = enable_loser_tree;
+	state->losers = NULL;
+
 	/*
 	 * workMem is forced to be at least 64KB, the current minimum valid value
 	 * for the work_mem GUC.  This is a defense against parallel sort callers
@@ -1652,11 +1664,12 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			 */
 			if (state->memtupcount > 0)
 			{
-				int			srcTapeIndex = state->memtuples[0].srctape;
+				int			i = (state->useLoserTree ? state->losers[0] : 0);
+				int			srcTapeIndex = state->memtuples[i].srctape;
 				LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
 				SortTuple	newtup;
 
-				*stup = state->memtuples[0];
+				*stup = state->memtuples[i];
 
 				/*
 				 * Remember the tuple we return, so that we can recycle its
@@ -1665,16 +1678,17 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				state->lastReturnedTuple = stup->tuple;
 
 				/*
-				 * Pull next tuple from tape, and replace the returned tuple
-				 * at top of the heap with it.
+				 * Pull next tuple from tape, and adjust the heap or loser tree.
 				 */
 				if (!mergereadnext(state, srcTape, &newtup))
 				{
 					/*
 					 * If no more data, we've reached end of run on this tape.
-					 * Remove the top node from the heap.
 					 */
-					tuplesort_heap_delete_top(state);
+					if (state->useLoserTree)
+						state->memtuples[i].srctape = LOSER_TREE_EOF;
+					else
+						tuplesort_heap_delete_top(state);
 					state->nInputRuns--;
 
 					/*
@@ -1682,10 +1696,22 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 					 * anyway, but better to release the memory early.
 					 */
 					LogicalTapeClose(srcTape);
-					return true;
 				}
-				newtup.srctape = srcTapeIndex;
-				tuplesort_heap_replace_top(state, &newtup);
+				else
+				{
+					newtup.srctape = srcTapeIndex;
+					if (state->useLoserTree)
+						state->memtuples[i] = newtup;
+					else
+						tuplesort_heap_replace_top(state, &newtup);
+				}
+
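+				/*
+				 * Replay matches to pick the next winner.  The loser tree
+				 * keeps its full size during the merge; only when the root
+				 * itself reports end of run are all input tapes exhausted,
+				 * at which point we zero memtupcount to end the merge.
+				 */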
+				if (state->useLoserTree)
+				{
+					tuplesort_loser_tree_adjust(state);
+					if (state->losers[0] == LOSER_TREE_EOF)
+						state->memtupcount = 0;
+				}
 				return true;
 			}
 			return false;
@@ -2066,8 +2092,8 @@ mergeruns(Tuplesortstate *state)
 		init_slab_allocator(state, 0);
 
 	/*
-	 * Allocate a new 'memtuples' array, for the heap.  It will hold one tuple
-	 * from each input tape.
+	 * Allocate a new 'memtuples' array.  It will hold one tuple from each
+	 * input tape.
 	 *
 	 * We could shrink this, too, between passes in a multi-pass merge, but we
 	 * don't bother.  (The initial input tapes are still in outputTapes.  The
@@ -2078,6 +2104,14 @@ mergeruns(Tuplesortstate *state)
 														state->nOutputTapes * sizeof(SortTuple));
 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 
+	/* Allocate a 'losers' array if we will use a loser tree for the merge */
+	if (state->useLoserTree && state->losers == NULL)
+	{
+		state->losers = (int16 *) MemoryContextAlloc(state->base.maincontext,
+													 state->nOutputTapes * sizeof(int16));
+		USEMEM(state, GetMemoryChunkSpace(state->losers));
+	}
+
 	/*
 	 * Use all the remaining memory we have available for tape buffers among
 	 * all the input tapes.  At the beginning of each merge pass, we will
@@ -2195,62 +2229,72 @@ mergeruns(Tuplesortstate *state)
 static void
 mergeonerun(Tuplesortstate *state)
 {
+	int			i;
 	int			srcTapeIndex;
 	LogicalTape *srcTape;
 
 	/*
 	 * Start the merge by loading one tuple from each active source tape into
-	 * the heap.
+	 * the heap or loser tree.
 	 */
 	beginmerge(state);
 
 	Assert(state->slabAllocatorUsed);
 
 	/*
-	 * Execute merge by repeatedly extracting lowest tuple in heap, writing it
-	 * out, and replacing it with next tuple from same tape (if there is
-	 * another one).
+	 * Execute merge by repeatedly extracting the lowest tuple from the heap
+	 * or loser tree, writing it out, pulling the next tuple from the same
+	 * tape (if there is another one), and adjusting the heap or loser tree.
 	 */
 	while (state->memtupcount > 0)
 	{
 		SortTuple	stup;
 
 		/* write the tuple to destTape */
-		srcTapeIndex = state->memtuples[0].srctape;
+		i = (state->useLoserTree ? state->losers[0] : 0);
+		srcTapeIndex = state->memtuples[i].srctape;
 		srcTape = state->inputTapes[srcTapeIndex];
-		WRITETUP(state, state->destTape, &state->memtuples[0]);
+		WRITETUP(state, state->destTape, &state->memtuples[i]);
 
 		/* recycle the slot of the tuple we just wrote out, for the next read */
-		if (state->memtuples[0].tuple)
-			RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
+		if (state->memtuples[i].tuple)
+			RELEASE_SLAB_SLOT(state, state->memtuples[i].tuple);
 
-		/*
-		 * pull next tuple from the tape, and replace the written-out tuple in
-		 * the heap with it.
-		 */
+		/* pull next tuple from the tape, and adjust the heap or loser tree */
 		if (mergereadnext(state, srcTape, &stup))
 		{
 			stup.srctape = srcTapeIndex;
-			tuplesort_heap_replace_top(state, &stup);
+			if (state->useLoserTree)
+				state->memtuples[i] = stup;
+			else
+				tuplesort_heap_replace_top(state, &stup);
 		}
 		else
 		{
-			tuplesort_heap_delete_top(state);
+			if (state->useLoserTree)
+				state->memtuples[i].srctape = LOSER_TREE_EOF;
+			else
+				tuplesort_heap_delete_top(state);
 			state->nInputRuns--;
 		}
+
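+		/*
+		 * Find the new winner.  Unlike the heap, the loser tree is not
+		 * shrunk as tapes are exhausted; an end-of-run marker reaching
+		 * the root means every input run is done.
+		 */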
+		if (state->useLoserTree)
+		{
+			tuplesort_loser_tree_adjust(state);
+			if (state->losers[0] == LOSER_TREE_EOF)
+				state->memtupcount = 0;
+		}
 	}
 
-	/*
-	 * When the heap empties, we're done.  Write an end-of-run marker on the
-	 * output tape.
-	 */
+	/* Done.  Write an end-of-run marker on the output tape. */
 	markrunend(state->destTape);
 }
 
 /*
  * beginmerge - initialize for a merge pass
  *
- * Fill the merge heap with the first tuple from each input tape.
+ * Pull the first tuple from each input tape, and build a heap
+ * or loser tree for the merge pass.
  */
 static void
 beginmerge(Tuplesortstate *state)
@@ -2276,7 +2320,10 @@ beginmerge(Tuplesortstate *state)
 		}
 	}
 
-	tuplesort_heap_build(state);
+	if (state->useLoserTree)
+		tuplesort_loser_tree_build(state);
+	else
+		tuplesort_heap_build(state);
 }
 
 /*
@@ -2826,6 +2873,95 @@ tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
 	memtuples[i] = *tuple;
 }
 
+/*
+ * Build a valid loser tree from memtuples[].
+ */
+static void
+tuplesort_loser_tree_build(Tuplesortstate *state)
+{
+	SortTuple  *memtuples = state->memtuples;
+	int			k = state->memtupcount;
+	int16		winners[MAXORDER];
+	int			i;
+
+	Assert(state->losers != NULL);
+	Assert(k <= MAXORDER);
+
+	/* can this happen? */
+	if (unlikely(k <= 0))
+		return;
+
+	/* 1-way merge, losers[0] is always 0 */
+	if (k == 1)
+	{
+		state->losers[0] = 0;
+		return;
+	}
+
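+	/*
+	 * The tree is stored implicitly: losers[1..k-1] are the internal
+	 * nodes, node i has children 2*i and 2*i+1, and a child index c >= k
+	 * denotes the leaf holding memtuples[c - k].  Play the matches
+	 * bottom-up, recording each match's loser in losers[i] and passing
+	 * the winner up through the local winners[] array.
+	 */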
+	/* fill the losers array */
+	for (i = k - 1; i > 0; i--)
+	{
+		int			l = i * 2;
+		int			r = l + 1;
+		int			ll = (l >= k ? l - k : winners[l]);
+		int			rr = (r >= k ? r - k : winners[r]);
+
+		if (COMPARETUP(state, &memtuples[ll], &memtuples[rr]) < 0)
+		{
+			winners[i] = ll;
+			state->losers[i] = rr;
+		}
+		else
+		{
+			winners[i] = rr;
+			state->losers[i] = ll;
+		}
+	}
+	state->losers[0] = winners[1];
+}
+
+/*
+ * Adjust the loser tree to find the new winner.  The caller must have
+ * already pulled the next tuple from the previous winner's input tape and
+ * placed it in memtuples[losers[0]].
+ */
+static void
+tuplesort_loser_tree_adjust(Tuplesortstate *state)
+{
+	SortTuple  *memtuples = state->memtuples;
+	int			k = state->memtupcount;
+	int			winner = state->losers[0];
+	int			i = (winner + k) / 2;
+
+	Assert(state->losers != NULL);
+	Assert(k > 0 && k <= MAXORDER);
+	Assert(winner >= 0 && winner < k);
+
+	/*
+	 * If we've reached the end of the run on this tape, set winner to
+	 * LOSER_TREE_EOF so that it always loses comparisons.
+	 */
+	if (memtuples[winner].srctape == LOSER_TREE_EOF)
+		winner = LOSER_TREE_EOF;
+
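+	/*
+	 * Walk from the parent of the winner's leaf up to the root (leaf j
+	 * lives at implicit index j + k, hence i starts at (winner + k) / 2),
+	 * replaying the match against the loser stored at each node passed.
+	 */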
+	for (; i > 0; i /= 2)
+	{
+		int loser = state->losers[i];
+
+		/* LOSER_TREE_EOF is always a loser */
+		if (loser == LOSER_TREE_EOF)
+			continue;
+
+		if (winner == LOSER_TREE_EOF ||
+			COMPARETUP(state, &memtuples[winner], &memtuples[loser]) > 0)
+		{
+			state->losers[i] = winner;
+			winner = loser;
+		}
+	}
+	state->losers[0] = winner;
+}
+
 /*
  * Function to reverse the sort direction from its current state
  *
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index f21ec37da89..8c22eed716b 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -324,6 +324,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
 extern PGDLLIMPORT char *role_string;
 extern PGDLLIMPORT bool in_hot_standby_guc;
 extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool enable_loser_tree;
 
 #ifdef DEBUG_BOUNDED_SORT
 extern PGDLLIMPORT bool optimize_bounded_sort;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 0411db832f1..b492e9de6fd 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_loser_tree              | off
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(25 rows)
+(26 rows)
 
 -- There are always wait event descriptions for various types.  InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
-- 
2.52.0

