On Thu, 29 Sept 2022 at 12:07, Peter Geoghegan <[email protected]> wrote:
> Also potentially relevant: the 2017 commit fa117ee4 anticipated adding
> a "copy" argument to tuplesort_getdatum() (the same commit added such
> a "copy" argument to tuplesort_gettupleslot()). I see that that still
> hasn't happened to tuplesort_getdatum() all these years later. Might
> be a good idea to do it in the next year or two, though.
>
> If David is interested in pursuing this now then I certainly won't object.
Just while this is fresh in my head, I wrote some code to make this
happen. My preference would be not to add the "copy" param to the
existing function and instead just add a new function to prevent
additional branching.
The attached puts back the datum sort in nodeSort.c for byref types
and adjusts process_ordered_aggregate_single() to make use of this
function.
I did a quick benchmark to see if this help DISTINCT aggregate any:
create table t1 (a varchar(32) not null, b varchar(32) not null);
insert into t1 select md5((x%10)::text),md5((x%10)::text) from
generate_Series(1,1000000)x;
vacuum freeze t1;
create index on t1(a);
With a work_mem of 256MBs I get:
query = select max(distinct a), max(distinct b) from t1;
Master:
latency average = 313.197 ms
Patched:
latency average = 304.335 ms
So not a very impressive speedup there (about 3%)
Some excerpts from perf top show:
Master:
1.40% postgres [.] palloc
1.13% postgres [.] tuplesort_getdatum
0.77% postgres [.] datumCopy
Patched:
0.91% postgres [.] tuplesort_getdatum_nocopy
0.65% postgres [.] palloc
I stared for a while at the mode_final() function and thought maybe we
could use the nocopy variant there. I just didn't quite pluck up the
motivation to write any code to see if it could be made faster.
David
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index fe74e49814..b5f63b5a2b 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -878,8 +878,8 @@ process_ordered_aggregate_single(AggState *aggstate,
* pfree them when they are no longer needed.
*/
- while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
- true, newVal, isNull,
&newAbbrevVal))
+ while
(tuplesort_getdatum_nocopy(pertrans->sortstates[aggstate->current_set],
+ true,
newVal, isNull, &newAbbrevVal))
{
/*
* Clear and select the working context for evaluation of the
equality
@@ -900,24 +900,33 @@ process_ordered_aggregate_single(AggState *aggstate,
pertrans->aggCollation,
oldVal, *newVal)))))
{
- /* equal to prior, so forget this one */
- if (!pertrans->inputtypeByVal && !*isNull)
- pfree(DatumGetPointer(*newVal));
+ MemoryContextSwitchTo(oldContext);
+ continue;
}
else
{
advance_transition_function(aggstate, pertrans,
pergroupstate);
- /* forget the old value, if any */
- if (!oldIsNull && !pertrans->inputtypeByVal)
- pfree(DatumGetPointer(oldVal));
- /* and remember the new one for subsequent equality
checks */
- oldVal = *newVal;
+
+ MemoryContextSwitchTo(oldContext);
+
+ /*
+ * Forget the old value, if any and remember the new
one for
+ * subsequent equality checks.
+ */
+ if (!pertrans->inputtypeByVal)
+ {
+ if (!oldIsNull)
+ pfree(DatumGetPointer(oldVal));
+ if (!*isNull)
+ oldVal = datumCopy(*newVal,
pertrans->inputtypeByVal,
+
pertrans->inputtypeLen);
+ }
+ else
+ oldVal = *newVal;
oldAbbrevVal = newAbbrevVal;
oldIsNull = *isNull;
haveOldVal = true;
}
-
- MemoryContextSwitchTo(oldContext);
}
if (!oldIsNull && !pertrans->inputtypeByVal)
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index 37ad35704b..f418be30a1 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -197,8 +197,8 @@ ExecSort(PlanState *pstate)
if (node->datumSort)
{
ExecClearTuple(slot);
- if (tuplesort_getdatum(tuplesortstate,
ScanDirectionIsForward(dir),
-
&(slot->tts_values[0]), &(slot->tts_isnull[0]), NULL))
+ if (tuplesort_getdatum_nocopy(tuplesortstate,
ScanDirectionIsForward(dir),
+
&(slot->tts_values[0]), &(slot->tts_isnull[0]), NULL))
ExecStoreVirtualTuple(slot);
}
else
@@ -278,10 +278,10 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
outerTupDesc = ExecGetResultType(outerPlanState(sortstate));
/*
- * We perform a Datum sort when we're sorting just a single byval
column,
+ * We perform a Datum sort when we're sorting just a single column,
* otherwise we perform a tuple sort.
*/
- if (outerTupDesc->natts == 1 && TupleDescAttr(outerTupDesc,
0)->attbyval)
+ if (outerTupDesc->natts == 1)
sortstate->datumSort = true;
else
sortstate->datumSort = false;
diff --git a/src/backend/utils/sort/tuplesortvariants.c
b/src/backend/utils/sort/tuplesortvariants.c
index afa5bdbf04..a9d51234e6 100644
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -886,6 +886,58 @@ tuplesort_getdatum(Tuplesortstate *state, bool forward,
return true;
}
+/*
+ * Fetch the next Datum in either forward or back direction.
+ * Returns false if no more datums.
+ *
+ * If the Datum is pass-by-ref type, the returned value is a pointer directly
+ * into the tuplesort's stup.tuple. This is only safe for callers that no
+ * longer require a reference to the datum beyond any subsequent manipulation
+ * of the tuplesort's state.
+ *
+ * Caller may optionally be passed back abbreviated value (on true return
+ * value) when abbreviation was used, which can be used to cheaply avoid
+ * equality checks that might otherwise be required. Caller can safely make a
+ * determination of "non-equal tuple" based on simple binary inequality. A
+ * NULL value will have a zeroed abbreviated value representation, which caller
+ * may rely on in abbreviated inequality check.
+ */
+bool
+tuplesort_getdatum_nocopy(Tuplesortstate *state, bool forward,
+ Datum *val, bool *isNull,
Datum *abbrev)
+{
+ TuplesortPublic *base = TuplesortstateGetPublic(state);
+ MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
+ SortTuple stup;
+
+ if (!tuplesort_gettuple_common(state, forward, &stup))
+ {
+ MemoryContextSwitchTo(oldcontext);
+ return false;
+ }
+
+ /* Ensure we copy into caller's memory context */
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Record abbreviated key for caller */
+ if (base->sortKeys->abbrev_converter && abbrev)
+ *abbrev = stup.datum1;
+
+ if (stup.isnull1 || !base->tuples)
+ {
+ *val = stup.datum1;
+ *isNull = stup.isnull1;
+ }
+ else
+ {
+ /* use stup.tuple because stup.datum1 may be an abbreviation */
+ *val = PointerGetDatum(stup.tuple);
+ *isNull = false;
+ }
+
+ return true;
+}
+
/*
* Routines specialized for HeapTuple (actually MinimalTuple) case
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 4441274990..fc8fc780eb 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -441,6 +441,7 @@ extern HeapTuple tuplesort_getheaptuple(Tuplesortstate
*state, bool forward);
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
Datum *val, bool
*isNull, Datum *abbrev);
-
+extern bool tuplesort_getdatum_nocopy(Tuplesortstate *state, bool forward,
+ Datum
*val, bool *isNull, Datum *abbrev);
#endif /* TUPLESORT_H */