> Ok, I reproduced that case, just not using a group by: by adding the group
> by a sort node is added in both cases (master and your patch), except that
> with your patch we sort on both keys and that doesn't really incur a
> performance penalty.
> 
> I think the overhead occurs because in the ExecAgg case, we use the
> tuplesort_*_datum API as an optimization when we have a single column as an
> input, which the ExecSort code doesn't. Maybe it would be worth it to try to
> use that API in sort nodes too, when it can be done.

Please find attached a POC patch to do just that.

The switch to the single-datum tuplesort is done when there is only one 
attribute, it is byval (to avoid having to deal with copy of the references 
everywhere) and we are not in bound mode (to also avoid having to move things 
around).

A naive run on make check pass on this, but I may have overlooked things.

Should I add this separately to the commitfest ?

For the record, the times I got on my laptop, on master VS david's patch VS 
both. Values are an average of 100 runs, as reported by pgbench --no-vacuum -f 
<file.sql> -t 100. There is a good amount of noise, but the simple "select one 
ordered column case" seems worth the optimization.

Only shared_buffers and work_mem have been set to 2GB each.

Setup 1: single table, 1 000 000 tuples, no index
CREATE TABLE tbench (
  a int,
  b int
);

INSERT INTO tbench (a, b) SELECT a, b FROM generate_series(1, 100) a, 
generate_series(1, 10000) b;


Test 1: Single-column ordered select (order by b since the table is already 
sorted by a)
select b from tbench order by b;

master: 303.661ms
with mine: 148.571ms

Test 2: Ordered sum (using b so that the input is not presorted)
select sum(b order by b) from tbench;

master: 112.379ms
with david's patch: 144.469ms
with david's patch + mine: 97ms

Test 3: Ordered sum + group by
select b, sum(a order by a) from tbench GROUP BY b;

master: 316.117ms
with david's patch: 297.079
with david's patch + mine: 294.601

Setup 2: same as before, but adding an index on (b, a)
CREATE INDEX ON tbench (b, a);

Test 2: Ordered sum:
select sum(a order by a) from tbench;

master: 111.847 ms
with david's patch: 48.088
with david's patch + mine: 47.678 ms

Test 3: Ordered sum + group by:
select a, sum(b order by b) from tbench GROUP BY a;

master: 76.873 ms
with david's patch: 61.105
with david's patch + mine:   62.672 ms


-- 
Ronan Dunklau
>From a833baf025f69762fb1f076bf1ef9c986dcbe824 Mon Sep 17 00:00:00 2001
From: Ronan Dunklau <ronan.dunk...@aiven.io>
Date: Mon, 5 Jul 2021 12:45:26 +0200
Subject: [PATCH] Allow Sort nodes to use the fast "single datum" tuplesort.

The sorting code in nodeagg made use of this API, but not the regular
nodesort one. This is a POC seeing how it can be done.
---
 src/backend/executor/nodeSort.c | 62 +++++++++++++++++++++++++--------
 src/include/nodes/execnodes.h   |  1 +
 2 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index b99027e0d7..03873b0d58 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -44,6 +44,7 @@ ExecSort(PlanState *pstate)
 	ScanDirection dir;
 	Tuplesortstate *tuplesortstate;
 	TupleTableSlot *slot;
+	Sort	   *plannode = (Sort *) node->ss.ps.plan;
 
 	CHECK_FOR_INTERRUPTS();
 
@@ -64,7 +65,6 @@ ExecSort(PlanState *pstate)
 
 	if (!node->sort_Done)
 	{
-		Sort	   *plannode = (Sort *) node->ss.ps.plan;
 		PlanState  *outerNode;
 		TupleDesc	tupDesc;
 
@@ -85,16 +85,29 @@ ExecSort(PlanState *pstate)
 
 		outerNode = outerPlanState(node);
 		tupDesc = ExecGetResultType(outerNode);
+		if ((tupDesc->natts == 1) &&
+			(TupleDescAttr(tupDesc, 0) ->attbyval) &&
+			!node->bounded)
+		{
 
-		tuplesortstate = tuplesort_begin_heap(tupDesc,
-											  plannode->numCols,
-											  plannode->sortColIdx,
-											  plannode->sortOperators,
-											  plannode->collations,
-											  plannode->nullsFirst,
-											  work_mem,
-											  NULL,
-											  node->randomAccess);
+			node->is_single_val = true;
+			tuplesortstate = tuplesort_begin_datum(TupleDescAttr(tupDesc, 0)->atttypid,
+												   plannode->sortOperators[0],
+												   plannode->collations[0],
+												   plannode->nullsFirst[0],
+												   work_mem,
+												   NULL,
+												   node->randomAccess);
+		} else
+			tuplesortstate = tuplesort_begin_heap(tupDesc,
+												  plannode->numCols,
+												  plannode->sortColIdx,
+												  plannode->sortOperators,
+												  plannode->collations,
+												  plannode->nullsFirst,
+												  work_mem,
+												  NULL,
+												  node->randomAccess);
 		if (node->bounded)
 			tuplesort_set_bound(tuplesortstate, node->bound);
 		node->tuplesortstate = (void *) tuplesortstate;
@@ -109,8 +122,15 @@ ExecSort(PlanState *pstate)
 
 			if (TupIsNull(slot))
 				break;
-
-			tuplesort_puttupleslot(tuplesortstate, slot);
+			if(node->is_single_val)
+			{
+				slot_getsomeattrs(slot, 1);
+				tuplesort_putdatum(tuplesortstate,
+								   slot->tts_values[0],
+								   slot->tts_isnull[0]);
+			} else {
+				tuplesort_puttupleslot(tuplesortstate, slot);
+			}
 		}
 
 		/*
@@ -150,9 +170,17 @@ ExecSort(PlanState *pstate)
 	 * next fetch from the tuplesort.
 	 */
 	slot = node->ss.ps.ps_ResultTupleSlot;
-	(void) tuplesort_gettupleslot(tuplesortstate,
-								  ScanDirectionIsForward(dir),
-								  false, slot, NULL);
+	if (node->is_single_val)
+	{
+		ExecClearTuple(slot);
+		if(tuplesort_getdatum(tuplesortstate, ScanDirectionIsForward(dir),
+						   &(slot->tts_values[0]), &(slot->tts_isnull[0]), NULL))
+			ExecStoreVirtualTuple(slot);
+	}
+	else
+		(void) tuplesort_gettupleslot(tuplesortstate,
+									  ScanDirectionIsForward(dir),
+									  false, slot, NULL);
 	return slot;
 }
 
@@ -167,6 +195,7 @@ SortState *
 ExecInitSort(Sort *node, EState *estate, int eflags)
 {
 	SortState  *sortstate;
+	TupleDesc out_tuple_desc;
 
 	SO1_printf("ExecInitSort: %s\n",
 			   "initializing sort node");
@@ -191,6 +220,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
 	sortstate->bounded = false;
 	sortstate->sort_Done = false;
 	sortstate->tuplesortstate = NULL;
+	sortstate->is_single_val = false;
 
 	/*
 	 * Miscellaneous initialization
@@ -208,6 +238,8 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
 	eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);
 
 	outerPlanState(sortstate) = ExecInitNode(outerPlan(node), estate, eflags);
+	out_tuple_desc = outerPlanState(sortstate)->ps_ResultTupleDesc;
+
 
 	/*
 	 * Initialize scan slot and type.
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 0ec5509e7e..643f416c54 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2151,6 +2151,7 @@ typedef struct SortState
 	int64		bound_Done;		/* value of bound we did the sort with */
 	void	   *tuplesortstate; /* private state of tuplesort.c */
 	bool		am_worker;		/* are we a worker? */
+	bool		is_single_val;  /* are we using the single value optimization ? */
 	SharedSortInfo *shared_info;	/* one entry per worker */
 } SortState;
 
-- 
2.32.0

Reply via email to