Attached is a very rough and limited proof of concept of $subject when
tuplesort uses abbreviated keys. It only works with int64:

Demo:

--setup
drop table if exists test;
create table test (a bigint);
insert into test select (1_000_000_000 * random())::bigint from
generate_series(1,1_000_000,1) i;
vacuum freeze test;
create extension if not exists pg_prewarm ;
select pg_prewarm('test');

set work_mem = '64MB';
\timing

select * from test order by a offset 1_000_000_000;
Time: 238.925 ms

set debug_branchless_sort = 'on'; select * from test order by a offset
1_000_000_000;
Time: 202.977 ms

With the patch, this case is probably held back by our simple
insertion sort, which I haven't changed.

The algorithm with pythonic pseudocode is from [1]. There is also Rust
code under a permissive license by the same authors elsewhere.

To evaluate this technique further, it'll need some work to handle
duplicates well. Then we can run tests on various input distributions.
If that's promising, we'll need to proceed with an idea I had years
ago to separate null/not-null into separate arrays as a prerequisite.
Then, we can probably simplify tuplesort.c some, including removing
each separate template invocation for integer-like keys. I'm also not
using the sort template in the PoC, to simplify development, and it's
not clear this work can be folded back into there without adding a lot
of complexity.

[1] https://orlp.net/blog/branchless-lomuto-partitioning/

-- 
John Naylor
Amazon Web Services
From 44d407ff0e3485676d435ec5cd54898b44f439ff Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Mon, 25 Nov 2024 12:37:33 +0700
Subject: [PATCH v1] Branchless Lomuto partitioning

POC that only works on datatypes that use ssup_datum_signed_cmp.
Dev-only GUC needed because it can't yet handle NULLs or DESC.
---
 src/backend/utils/misc/guc_tables.c           |  12 ++
 .../utils/sort/part_right_branchless.h        |  21 +++
 src/backend/utils/sort/tuplesort.c            | 150 +++++++++++++++++-
 src/include/utils/guc.h                       |   1 +
 src/include/utils/tuplesort.h                 |   3 +
 5 files changed, 184 insertions(+), 3 deletions(-)
 create mode 100644 src/backend/utils/sort/part_right_branchless.h

diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 8a67f01200..9d69890d72 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1709,6 +1709,18 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		/* XXX not for commit */
+		{"debug_branchless_sort", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("WIP testing of branchless sort techniques"),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&debug_branchless_sort,
+		false,
+		NULL, NULL, NULL
+	},
+
 #ifdef TRACE_SYNCSCAN
 	/* this is undocumented because not exposed in a standard build */
 	{
diff --git a/src/backend/utils/sort/part_right_branchless.h b/src/backend/utils/sort/part_right_branchless.h
new file mode 100644
index 0000000000..37d93bf45f
--- /dev/null
+++ b/src/backend/utils/sort/part_right_branchless.h
@@ -0,0 +1,21 @@
+/*
+ * Based on psuedocode from https://orlp.net/blog/branchless-lomuto-partitioning/
+ *
+ * There is deliberately no include guard here.
+ */
+
+SortTuple pivot = v[0];
+size_t i = 0;
+size_t j = 0;
+
+while (j < n - 1)
+{
+	v[j] = v[i];
+	j += 1;
+	v[i] = v[j];
+	i += CMP_2WAY(v[i], pivot);
+}
+
+v[j] = v[i];
+v[i] = pivot;
+return &v[i];
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index c960cfa823..f71defe633 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -122,6 +122,7 @@
 
 /* GUC variables */
 bool		trace_sort = false;
+bool		debug_branchless_sort = false;	/* XXX not for commit */
 
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
@@ -619,6 +620,134 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
 #define ST_DEFINE
 #include "lib/sort_template.h"
 
+
+/* WIP: just a demo: sort tuples with nulls in the first key should be put in a separate array */
+#define SS_COMPARE(a, b) \
+	ApplySortComparator((a)->datum1, false, \
+						(b)->datum1, false, ssup)
+
+/* XXX: only works on first sort key */
+static pg_noinline SortTuple *
+med3(SortTuple *a,
+	 SortTuple *b,
+	 SortTuple *c, SortSupport ssup)
+{
+	return SS_COMPARE(a, b) < 0 ?
+		(SS_COMPARE(b, c) < 0 ? b : (SS_COMPARE(a, c) < 0 ? c : a))
+		: (SS_COMPARE(b, c) > 0 ? b : (SS_COMPARE(a, c) < 0 ? a : c));
+}
+
+static SortTuple *
+part_right_signed_asc(SortTuple *v, size_t n)
+{
+#define CMP_2WAY(v, pivot) DatumGetInt64((v).datum1) < DatumGetInt64((pivot).datum1)
+#include "part_right_branchless.h"
+#undef CMP_2WAY
+}
+
+static inline void
+swap(SortTuple *a, SortTuple *b)
+{
+	SortTuple	tmp = *a;
+
+	*a = *b;
+	*b = tmp;
+}
+
+/* WIP: proof of concept for one data type, with no tiebreaks or NULL handling */
+static void
+qsort_ssup_branchless(SortTuple *data, size_t n, Tuplesortstate *state)
+{
+	SortTuple  *a = data,
+			   *pl,
+			   *pm,
+			   *pn,
+			   *pivot_pos;
+	int			presorted;
+	SortSupportData *ssup = state->base.sortKeys;
+
+
+loop:
+	if (n < 2)
+		return;
+
+	CHECK_FOR_INTERRUPTS();
+
+	if (n < 7)
+	{
+		/* TODO: we should use moves rather than swaps */
+		/*
+		 * TODO: fastpath on front slice using ping-pong merge or sorting
+		 * network
+		 */
+		/* WIP: needs to be aware of tiebreak comparators */
+		for (pm = a + 1; pm < a + n;
+			 pm += 1)
+			for (pl = pm; pl > a && SS_COMPARE(pl - 1, pl) > 0;
+				 pl -= 1)
+				swap(pl, pl - 1);
+		return;
+	}
+
+	/*
+	 * TODO: Branchless Lomuto partitioning is said to not work well with
+	 * descending inputs, so add a similar check for descending input. We
+	 * could probably hoist these out of the loop and only check once at the
+	 * beginning.
+	 */
+	presorted = 1;
+	for (pm = a + 1; pm < a + n;
+		 pm += 1)
+	{
+		CHECK_FOR_INTERRUPTS();
+		if (SS_COMPARE(pm - 1, pm) > 0)
+		{
+			presorted = 0;
+			break;
+		}
+	}
+	if (presorted)
+		return;
+
+	pm = a + (n / 2);
+	if (n > 7)
+	{
+		pl = a;
+		pn = a + (n - 1);
+		if (n > 40)
+		{
+			size_t		d = (n / 8);
+
+			pl = med3(pl, pl + d, pl + 2 * d, ssup);
+			pm = med3(pm - d, pm, pm + d, ssup);
+			pn = med3(pn - 2 * d, pn - d, pn, ssup);
+		}
+		pm = med3(pl, pm, pn, ssup);
+	}
+
+	/* place pivot at the front */
+	swap(a, pm);
+
+	/*
+	 * WIP: check if pivot is same as previous pivot and do "partition left"
+	 * to handle duplicates. If the abbrev key is not authoritative, we need
+	 * to recurse with a sort using the tiebreak comparator
+	 */
+
+	pivot_pos = state->base.partition_right(a, n);
+
+	/* WIP: Keep recursion simple for now. */
+
+	/* Recurse on left partition... */
+	qsort_ssup_branchless(a, pivot_pos - a, state);
+
+	/* ..., then iterate on right partition to save stack space */
+	n -= pivot_pos + 1 - a;
+	a = pivot_pos + 1;
+	goto loop;
+}
+
+
 /*
  *		tuplesort_begin_xxx
  *
@@ -2695,9 +2824,24 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 #if SIZEOF_DATUM >= 8
 			else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
 			{
-				qsort_tuple_signed(state->memtuples,
-								   state->memtupcount,
-								   state);
+				SortTuple  *pm;
+
+				if (debug_branchless_sort)
+				{
+					state->base.partition_right = part_right_signed_asc;
+					qsort_ssup_branchless(state->memtuples,
+										  state->memtupcount,
+										  state);
+				}
+				else
+					qsort_tuple_signed(state->memtuples,
+									   state->memtupcount,
+									   state);
+
+				for (pm = state->memtuples + 1;
+					 pm < state->memtuples + state->memtupcount;
+					 pm++)
+					Assert(COMPARETUP(state, pm - 1, pm) <= 0);
 				return;
 			}
 #endif
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 840b0fe57f..33fe01c573 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -296,6 +296,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
 extern PGDLLIMPORT char *role_string;
 extern PGDLLIMPORT bool in_hot_standby_guc;
 extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool debug_branchless_sort; /* XXX not for commit */
 
 #ifdef DEBUG_BOUNDED_SORT
 extern PGDLLIMPORT bool optimize_bounded_sort;
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index cde83f6201..936f21b0db 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -180,6 +180,9 @@ typedef struct
 	 */
 	SortTupleComparator comparetup_tiebreak;
 
+	/* Optional partition functions for single-instruction comparators */
+	SortTuple*		(*partition_right) (SortTuple *begin, size_t n);
+
 	/*
 	 * Alter datum1 representation in the SortTuple's array back from the
 	 * abbreviated key to the first column value.
-- 
2.47.0

Reply via email to