On Thu, 16 Feb 2023 at 00:45, John Naylor <john.nay...@enterprisedb.com> wrote:
> Okay, here's a rerun including the sort hack, and it looks like
> incremental sort is only ahead with the smallest set, otherwise same or
> maybe slightly slower:
>
> HEAD:
>
> 4 ^ 8: latency average = 113.461 ms
> 5 ^ 8: latency average = 786.080 ms
> 6 ^ 8: latency average = 3948.388 ms
> 7 ^ 8: latency average = 15733.348 ms
>
> tiebreaker:
>
> 4 ^ 8: latency average = 106.556 ms
> 5 ^ 8: latency average = 734.834 ms
> 6 ^ 8: latency average = 3640.507 ms
> 7 ^ 8: latency average = 14470.199 ms
>
> tiebreaker + incr sort hack:
>
> 4 ^ 8: latency average = 93.998 ms
> 5 ^ 8: latency average = 740.120 ms
> 6 ^ 8: latency average = 3715.942 ms
> 7 ^ 8: latency average = 14749.323 ms
Sad news :( the sort hacks are still quite a bit faster for 4 ^ 8.

I was fooling around with the attached (very quickly and crudely put
together) patch just there. The idea is to sort during
tuplesort_puttuple_common() when the memory consumption goes over some
approximation of L3 cache size, in the hope that the tuples in question
are still sitting in cache lines. The code is not ideal there, as we
track availMem rather than the memory actually used, so maybe I need to
do that better; we could cross a boundary without actually having done
very much, plus we USEMEM for other reasons too.

I found that the patch didn't really help:

create table t (a int not null, b int not null);
insert into t select (x*random())::int % 100,(x*random())::int % 100
from generate_Series(1,10000000)x;
vacuum freeze t;
select pg_prewarm('t');

show work_mem;
 work_mem
----------
 4GB

explain (analyze, timing off) select * from t order by a,b;

master:
Execution Time: 5620.704 ms
Execution Time: 5506.705 ms

patched:
Execution Time: 6801.421 ms
Execution Time: 6762.130 ms

I suspect it's slower because the final sort must still sort the entire
array without any knowledge that portions of it are pre-sorted.

It would be very interesting to improve this with some additional work:
keep track of the "memtupsortedto" indexes by pushing them onto a List
each time we cross the availMem boundary, then qsort just the final
portion of the array in tuplesort_performsort() and do a k-way merge of
the segments rather than qsorting the entire thing again. I suspect this
would be faster when work_mem exceeds L3 by some large amount.

David
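To make that segmented idea a bit more concrete, here is a rough
standalone sketch of the shape it might take. This is hypothetical, not
tuplesort code: plain ints stand in for SortTuples, a fixed-size
run_start[] array stands in for the List of memtupsortedto values, the
16MB EFFECTIVE_CPU_L3_SIZE is just an assumption, and a naive linear scan
is used for the merge where a binary heap would be more appropriate.

/*
 * Hypothetical sketch only: presort roughly L3-sized runs as tuples
 * arrive, remember where each run starts, then k-way merge the presorted
 * runs instead of re-sorting the whole array.
 */
#include <stdio.h>
#include <stdlib.h>

#define EFFECTIVE_CPU_L3_SIZE (16 * 1024 * 1024)    /* assumed 16MB L3 */
#define NTUPLES (8 * 1024 * 1024)
#define MAX_RUNS 64

static int
cmp_int(const void *a, const void *b)
{
    int     x = *(const int *) a;
    int     y = *(const int *) b;

    return (x > y) - (x < y);
}

int
main(void)
{
    int        *tuples = malloc(NTUPLES * sizeof(int));
    int        *merged = malloc(NTUPLES * sizeof(int));
    size_t      run_start[MAX_RUNS + 1];    /* stand-in for the List of memtupsortedto values */
    size_t      pos[MAX_RUNS];
    int         nruns = 0;
    size_t      used_mem = 0;
    size_t      sortedto = 0;
    size_t      count;

    if (tuples == NULL || merged == NULL)
        return 1;

    /* "puttuple" loop: sort the still-cache-hot tail at each L3 boundary */
    for (count = 0; count < NTUPLES; count++)
    {
        size_t      before = used_mem / EFFECTIVE_CPU_L3_SIZE;

        tuples[count] = rand();
        used_mem += sizeof(int);

        if (before != used_mem / EFFECTIVE_CPU_L3_SIZE && nruns < MAX_RUNS - 1)
        {
            qsort(tuples + sortedto, count + 1 - sortedto, sizeof(int), cmp_int);
            run_start[nruns++] = sortedto;
            sortedto = count + 1;
        }
    }

    /* "performsort": qsort only the final, not-yet-sorted portion */
    qsort(tuples + sortedto, count - sortedto, sizeof(int), cmp_int);
    run_start[nruns++] = sortedto;
    run_start[nruns] = count;   /* end sentinel for the last run */

    /* naive k-way merge of the presorted runs */
    for (int r = 0; r < nruns; r++)
        pos[r] = run_start[r];

    for (size_t out = 0; out < count; out++)
    {
        int     best = -1;

        for (int r = 0; r < nruns; r++)
        {
            if (pos[r] < run_start[r + 1] &&
                (best < 0 || tuples[pos[r]] < tuples[pos[best]]))
                best = r;
        }
        merged[out] = tuples[pos[best]++];
    }

    printf("min=%d max=%d\n", merged[0], merged[count - 1]);
    free(tuples);
    free(merged);
    return 0;
}

Each run is at most roughly L3-sized when it gets sorted, and the final
merge touches each element only once more, which is where the hoped-for
win over re-running qsort on the whole array would have to come from.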
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9ca9835aab..65ffe442cd 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -216,6 +216,7 @@ struct Tuplesortstate
 	SortTuple  *memtuples;		/* array of SortTuple structs */
 	int			memtupcount;	/* number of tuples currently present */
 	int			memtupsize;		/* allocated length of memtuples array */
+	int			memtupsortedto; /* index of where we've already sorted up to */
 	bool		growmemtuples;	/* memtuples' growth still underway? */
 
 	/*
@@ -465,7 +466,7 @@ static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple
 static void dumptuples(Tuplesortstate *state, bool alltuples);
 static void make_bounded_heap(Tuplesortstate *state);
 static void sort_bounded_heap(Tuplesortstate *state);
-static void tuplesort_sort_memtuples(Tuplesortstate *state);
+static void tuplesort_sort_memtuples(Tuplesortstate *state, int start_index);
 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_delete_top(Tuplesortstate *state);
@@ -708,6 +709,7 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
 	 */
 	state->memtupsize = INITIAL_MEMTUPSIZE;
 	state->memtuples = NULL;
+	state->memtupsortedto = 0;
 
 	/*
 	 * After all of the other non-parallel-related state, we setup all of the
@@ -793,6 +795,7 @@ tuplesort_begin_batch(Tuplesortstate *state)
 	state->tapeset = NULL;
 
 	state->memtupcount = 0;
+	state->memtupsortedto = 0;
 
 	/*
 	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
@@ -1193,10 +1196,29 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre
 
 	Assert(!LEADER(state));
 
+/* just roughly. Must be power-of-2 for efficient division */
+#define EFFECTIVE_CPU_L3_SIZE 16777216
+
 	/* Count the size of the out-of-line data */
 	if (tuple->tuple != NULL)
+	{
+		size_t		before_mem = state->availMem / EFFECTIVE_CPU_L3_SIZE;
+
 		USEMEM(state, GetMemoryChunkSpace(tuple->tuple));
 
+		/*
+		 * Be a bit more cache aware in the sort and do a presort of the
+		 * tuples seen thus far while they're likely still sitting in
+		 * L3.
+		 */
+		if (before_mem != state->availMem / EFFECTIVE_CPU_L3_SIZE)
+		{
+			tuplesort_sort_memtuples(state, state->memtupsortedto);
+			state->memtupsortedto = state->memtupcount;
+		}
+	}
+
 	if (!useAbbrev)
 	{
 		/*
@@ -1403,7 +1425,7 @@ tuplesort_performsort(Tuplesortstate *state)
 			if (SERIAL(state))
 			{
 				/* Just qsort 'em and we're done */
-				tuplesort_sort_memtuples(state);
+				tuplesort_sort_memtuples(state, 0);
 				state->status = TSS_SORTEDINMEM;
 			}
 			else if (WORKER(state))
@@ -2388,7 +2410,7 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	 * Sort all tuples accumulated within the allowed amount of memory for
 	 * this run using quicksort
 	 */
-	tuplesort_sort_memtuples(state);
+	tuplesort_sort_memtuples(state, 0);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -2413,6 +2435,7 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	}
 
 	state->memtupcount = 0;
+	state->memtupsortedto = 0;
 
 	/*
 	 * Reset tuple memory.  We've freed all of the tuples that we previously
@@ -2636,6 +2659,8 @@ make_bounded_heap(Tuplesortstate *state)
 	reversedirection(state);
 
 	state->memtupcount = 0;		/* make the heap empty */
+	state->memtupsortedto = 0;
+
 	for (i = 0; i < tupcount; i++)
 	{
 		if (state->memtupcount < state->bound)
@@ -2711,11 +2736,11 @@ sort_bounded_heap(Tuplesortstate *state)
  * Quicksort is used for small in-memory sorts, and external sort runs.
  */
 static void
-tuplesort_sort_memtuples(Tuplesortstate *state)
+tuplesort_sort_memtuples(Tuplesortstate *state, int start_index)
 {
 	Assert(!LEADER(state));
 
-	if (state->memtupcount > 1)
+	if (state->memtupcount - start_index > 1)
 	{
 		/*
 		 * Do we have the leading column's value or abbreviation in datum1,
@@ -2725,24 +2750,24 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		{
 			if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
 			{
-				qsort_tuple_unsigned(state->memtuples,
-									 state->memtupcount,
+				qsort_tuple_unsigned(&state->memtuples[start_index],
+									 state->memtupcount - start_index,
 									 state);
 				return;
 			}
 #if SIZEOF_DATUM >= 8
 			else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
 			{
-				qsort_tuple_signed(state->memtuples,
-								   state->memtupcount,
+				qsort_tuple_signed(&state->memtuples[start_index],
+								   state->memtupcount - start_index,
 								   state);
 				return;
 			}
 #endif
 			else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
 			{
-				qsort_tuple_int32(state->memtuples,
-								  state->memtupcount,
+				qsort_tuple_int32(&state->memtuples[start_index],
+								  state->memtupcount - start_index,
 								  state);
 				return;
 			}
@@ -2751,13 +2776,14 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		/* Can we use the single-key sort function? */
 		if (state->base.onlyKey != NULL)
 		{
-			qsort_ssup(state->memtuples, state->memtupcount,
+			qsort_ssup(&state->memtuples[start_index],
+					   state->memtupcount - start_index,
 					   state->base.onlyKey);
 		}
 		else
 		{
-			qsort_tuple(state->memtuples,
-						state->memtupcount,
+			qsort_tuple(&state->memtuples[start_index],
+						state->memtupcount - start_index,
 						state->base.comparetup,
 						state);
 		}
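For anyone who wants to poke at the boundary heuristic without rebuilding
PostgreSQL, here is a stripped-down standalone analogue of what the patch
above does. It is hypothetical: made-up names, plain ints instead of
SortTuples, and an assumed 16MB effective L3. As in the patch, the final
qsort still re-sorts the entire array, which is likely where the
regression comes from.

/*
 * Hypothetical standalone analogue of the patch: presort the
 * still-cache-hot tail of the array each time the accumulated memory
 * crosses an (assumed) 16MB L3 boundary, then do the usual full qsort.
 */
#include <stdio.h>
#include <stdlib.h>

#define EFFECTIVE_CPU_L3_SIZE (16 * 1024 * 1024)
#define NTUPLES (8 * 1024 * 1024)

static int
cmp_int(const void *a, const void *b)
{
    int     x = *(const int *) a;
    int     y = *(const int *) b;

    return (x > y) - (x < y);
}

int
main(void)
{
    int        *tuples = malloc(NTUPLES * sizeof(int));
    size_t      used_mem = 0;
    size_t      sortedto = 0;   /* analogue of memtupsortedto */
    size_t      count;

    if (tuples == NULL)
        return 1;

    for (count = 0; count < NTUPLES; count++)
    {
        size_t      before = used_mem / EFFECTIVE_CPU_L3_SIZE;

        tuples[count] = rand();
        used_mem += sizeof(int);

        /* Crossed an L3-sized boundary: sort the unsorted tail now. */
        if (before != used_mem / EFFECTIVE_CPU_L3_SIZE)
        {
            qsort(tuples + sortedto, count + 1 - sortedto, sizeof(int), cmp_int);
            sortedto = count + 1;
        }
    }

    /* As in the patch, the final sort still covers the whole array. */
    qsort(tuples, count, sizeof(int), cmp_int);

    printf("min=%d max=%d\n", tuples[0], tuples[count - 1]);
    free(tuples);
    return 0;
}

Swapping that final full qsort for the segment merge sketched earlier in
the thread is the experiment suggested above.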