We originally did this in 91e9e89dc, but a memory leak was discovered
as I neglected to pfree the datum which is freshly allocated in
tuplesort_getdatum. Because that was discovered late in the PG15
cycle, we opted to just disable the datum sort optimisation for byref
types in 3a5817695.
As was mentioned in [1], it looks like we could really use a version
of tuplesort_getdatum which does not palloc a new Datum. nodeSort.c,
when calling tuplesort_gettupleslot passes copy==false, so it would
make sense if the datum sort variation didn't do any copying either.
In the attached patch, I've added a function named
tuplesort_getdatum_nocopy() which is the same as tuplesort_getdatum()
only without the datumCopy(). I opted for the new function rather than
a new parameter in the existing function just to reduce branching and
additional needless overhead.
I also looked at the tuplesort_getdatum() call inside
process_ordered_aggregate_single() and made a few changes there so we
don't needlessly perform a datumCopy() when we skip a Datum due to
finding it the same as the previous Datum in a DISTINCT aggregate
situation.
I was also looking at mode_final(). Perhaps that could do with the
same treatment, I just didn't touch it in the attached patch.
A quick performance test with:
create table t1 (a varchar(32) not null, b varchar(32) not null);
insert into t1 select md5((x%10)::text),md5((x%10)::text) from
generate_Series(1,1000000)x;
vacuum freeze t1;
create index on t1(a);
Yields a small speedup for the DISTINCT aggregate case.
work_mem = 256MB
query = select max(distinct a), max(distinct b) from t1;
Master:
latency average = 313.197 ms
Patched:
latency average = 304.335 ms (about 3% faster)
The Datum sort in nodeSort.c is more impressive.
query = select b from t1 order by b offset 1000000;
Master:
latency average = 344.763 ms
Patched:
latency average = 268.374 ms (about 28% faster)
I'll add this to the November CF
David
[1]
https://www.postgresql.org/message-id/CAApHDvqS6wC5U==k9Hd26E4EQXH3QR67-T4=q1rq36ngvjf...@mail.gmail.com
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index fe74e49814..b5f63b5a2b 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -878,8 +878,8 @@ process_ordered_aggregate_single(AggState *aggstate,
* pfree them when they are no longer needed.
*/
- while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
- true, newVal, isNull,
&newAbbrevVal))
+ while
(tuplesort_getdatum_nocopy(pertrans->sortstates[aggstate->current_set],
+ true,
newVal, isNull, &newAbbrevVal))
{
/*
* Clear and select the working context for evaluation of the
equality
@@ -900,24 +900,33 @@ process_ordered_aggregate_single(AggState *aggstate,
pertrans->aggCollation,
oldVal, *newVal)))))
{
- /* equal to prior, so forget this one */
- if (!pertrans->inputtypeByVal && !*isNull)
- pfree(DatumGetPointer(*newVal));
+ MemoryContextSwitchTo(oldContext);
+ continue;
}
else
{
advance_transition_function(aggstate, pertrans,
pergroupstate);
- /* forget the old value, if any */
- if (!oldIsNull && !pertrans->inputtypeByVal)
- pfree(DatumGetPointer(oldVal));
- /* and remember the new one for subsequent equality
checks */
- oldVal = *newVal;
+
+ MemoryContextSwitchTo(oldContext);
+
+ /*
+ * Forget the old value, if any and remember the new
one for
+ * subsequent equality checks.
+ */
+ if (!pertrans->inputtypeByVal)
+ {
+ if (!oldIsNull)
+ pfree(DatumGetPointer(oldVal));
+ if (!*isNull)
+ oldVal = datumCopy(*newVal,
pertrans->inputtypeByVal,
+
pertrans->inputtypeLen);
+ }
+ else
+ oldVal = *newVal;
oldAbbrevVal = newAbbrevVal;
oldIsNull = *isNull;
haveOldVal = true;
}
-
- MemoryContextSwitchTo(oldContext);
}
if (!oldIsNull && !pertrans->inputtypeByVal)
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index 37ad35704b..f418be30a1 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -197,8 +197,8 @@ ExecSort(PlanState *pstate)
if (node->datumSort)
{
ExecClearTuple(slot);
- if (tuplesort_getdatum(tuplesortstate,
ScanDirectionIsForward(dir),
-
&(slot->tts_values[0]), &(slot->tts_isnull[0]), NULL))
+ if (tuplesort_getdatum_nocopy(tuplesortstate,
ScanDirectionIsForward(dir),
+
&(slot->tts_values[0]), &(slot->tts_isnull[0]), NULL))
ExecStoreVirtualTuple(slot);
}
else
@@ -278,10 +278,10 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
outerTupDesc = ExecGetResultType(outerPlanState(sortstate));
/*
- * We perform a Datum sort when we're sorting just a single byval
column,
+ * We perform a Datum sort when we're sorting just a single column,
* otherwise we perform a tuple sort.
*/
- if (outerTupDesc->natts == 1 && TupleDescAttr(outerTupDesc,
0)->attbyval)
+ if (outerTupDesc->natts == 1)
sortstate->datumSort = true;
else
sortstate->datumSort = false;
diff --git a/src/backend/utils/sort/tuplesortvariants.c
b/src/backend/utils/sort/tuplesortvariants.c
index afa5bdbf04..a9d51234e6 100644
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -886,6 +886,58 @@ tuplesort_getdatum(Tuplesortstate *state, bool forward,
return true;
}
+/*
+ * Fetch the next Datum in either forward or back direction.
+ * Returns false if no more datums.
+ *
+ * If the Datum is pass-by-ref type, the returned value is a pointer directly
+ * into the tuplesort's stup.tuple. This is only safe for callers that no
+ * longer require a reference to the datum beyond any subsequent manipulation
+ * of the tuplesort's state.
+ *
+ * Caller may optionally be passed back abbreviated value (on true return
+ * value) when abbreviation was used, which can be used to cheaply avoid
+ * equality checks that might otherwise be required. Caller can safely make a
+ * determination of "non-equal tuple" based on simple binary inequality. A
+ * NULL value will have a zeroed abbreviated value representation, which caller
+ * may rely on in abbreviated inequality check.
+ */
+bool
+tuplesort_getdatum_nocopy(Tuplesortstate *state, bool forward,
+ Datum *val, bool *isNull,
Datum *abbrev)
+{
+ TuplesortPublic *base = TuplesortstateGetPublic(state);
+ MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
+ SortTuple stup;
+
+ if (!tuplesort_gettuple_common(state, forward, &stup))
+ {
+ MemoryContextSwitchTo(oldcontext);
+ return false;
+ }
+
+ /* Ensure we copy into caller's memory context */
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Record abbreviated key for caller */
+ if (base->sortKeys->abbrev_converter && abbrev)
+ *abbrev = stup.datum1;
+
+ if (stup.isnull1 || !base->tuples)
+ {
+ *val = stup.datum1;
+ *isNull = stup.isnull1;
+ }
+ else
+ {
+ /* use stup.tuple because stup.datum1 may be an abbreviation */
+ *val = PointerGetDatum(stup.tuple);
+ *isNull = false;
+ }
+
+ return true;
+}
+
/*
* Routines specialized for HeapTuple (actually MinimalTuple) case
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 4441274990..fc8fc780eb 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -441,6 +441,7 @@ extern HeapTuple tuplesort_getheaptuple(Tuplesortstate
*state, bool forward);
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
Datum *val, bool
*isNull, Datum *abbrev);
-
+extern bool tuplesort_getdatum_nocopy(Tuplesortstate *state, bool forward,
+ Datum
*val, bool *isNull, Datum *abbrev);
#endif /* TUPLESORT_H */