On Thu, 16 Feb 2023 at 00:45, John Naylor <john.nay...@enterprisedb.com> wrote:
> Okay, here's a rerun including the sort hack, and it looks like incremental 
> sort is only ahead with the smallest set, otherwise same or maybe slightly 
> slower:
>
> HEAD:
>
> 4 ^ 8: latency average = 113.461 ms
> 5 ^ 8: latency average = 786.080 ms
> 6 ^ 8: latency average = 3948.388 ms
> 7 ^ 8: latency average = 15733.348 ms
>
> tiebreaker:
>
> 4 ^ 8: latency average = 106.556 ms
> 5 ^ 8: latency average = 734.834 ms
> 6 ^ 8: latency average = 3640.507 ms
> 7 ^ 8: latency average = 14470.199 ms
>
> tiebreaker + incr sort hack:
>
> 4 ^ 8: latency average = 93.998 ms
> 5 ^ 8: latency average = 740.120 ms
> 6 ^ 8: latency average = 3715.942 ms
> 7 ^ 8: latency average = 14749.323 ms

Sad news :(  the sort hacks are still quite a bit faster for 4 ^ 8.

I was fooling around with the attached (very quickly and crudely put
together) patch just there. The idea is to sort during
tuplesort_puttuple_common() whenever memory consumption crosses some
approximation of L3 cache size, in the hope that the cache lines for
the tuples in question are still resident. The code is not ideal
there as we track availMem rather than the memory actually used, so
maybe I need to do that better: we could cross a boundary without
having accumulated much tuple data, plus we USEMEM for other reasons
too.
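
To make the boundary test concrete, here is a minimal standalone
sketch of it. It tracks a hypothetical usedMem counter (tuple memory
only) instead of availMem; the names are illustrative, not from
tuplesort.c:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* just roughly; power-of-2 so the division compiles to a shift */
#define EFFECTIVE_CPU_L3_SIZE 16777216

static size_t usedMem = 0;	/* hypothetical: tuple memory only */

/* true when this allocation pushes usedMem across an L3-sized boundary */
static bool
crosses_l3_boundary(size_t alloc_size)
{
	size_t		before = usedMem / EFFECTIVE_CPU_L3_SIZE;

	usedMem += alloc_size;
	return usedMem / EFFECTIVE_CPU_L3_SIZE != before;
}

int
main(void)
{
	/* simulate a stream of 64-byte tuple allocations */
	for (long i = 0; i < 3000000; i++)
	{
		if (crosses_l3_boundary(64))
			printf("presort at tuple %ld (%zu MB consumed)\n",
				   i, usedMem / (1024 * 1024));
	}
	return 0;
}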

I found that the patch didn't really help:

create table t (a int not null, b int not null);
insert into t select (x*random())::int % 100,(x*random())::int % 100
from generate_series(1, 10000000) x;
vacuum freeze t;
select pg_prewarm('t');
show work_mem;
 work_mem
----------
 4GB

explain (analyze, timing off) select * from t order by a,b;

master:
Execution Time: 5620.704 ms
Execution Time: 5506.705 ms

patched:
Execution Time: 6801.421 ms
Execution Time: 6762.130 ms

I suspect it's slower because the final sort must still sort the
entire array with no knowledge that portions of it are pre-sorted.
It would be very interesting to improve this with some additional
work: keep track of the "memtupsortedto" indexes by pushing them onto
a List each time we cross the availMem boundary, then in
tuplesort_performsort() qsort just the final portion of the array and
do a k-way merge of the sorted segments rather than qsorting the
entire thing again (a toy sketch of that is below). I suspect this
would be faster when work_mem exceeds L3 by some large amount.
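
Here's a standalone toy demonstrating that shape, with plain ints in
place of SortTuples and SEGSIZE standing in for one boundary's worth
of tuples; none of this is tuplesort code:

#include <stdio.h>
#include <stdlib.h>

#define NELEMS	(1 << 20)
#define SEGSIZE (1 << 16)		/* stand-in for one L3-sized presorted run */

typedef struct SegCursor
{
	int		   *pos;			/* next unconsumed element in this segment */
	int		   *end;			/* one past the segment's last element */
} SegCursor;

static int
cmp_int(const void *a, const void *b)
{
	int			av = *(const int *) a;
	int			bv = *(const int *) b;

	return (av > bv) - (av < bv);
}

/* restore min-heap order (keyed on *pos) starting at index i */
static void
heap_siftdown(SegCursor *heap, int nheap, int i)
{
	for (;;)
	{
		int			smallest = i;
		int			l = 2 * i + 1;
		int			r = 2 * i + 2;
		SegCursor	tmp;

		if (l < nheap && *heap[l].pos < *heap[smallest].pos)
			smallest = l;
		if (r < nheap && *heap[r].pos < *heap[smallest].pos)
			smallest = r;
		if (smallest == i)
			break;
		tmp = heap[i];
		heap[i] = heap[smallest];
		heap[smallest] = tmp;
		i = smallest;
	}
}

int
main(void)
{
	int		   *a = malloc(NELEMS * sizeof(int));
	int		   *out = malloc(NELEMS * sizeof(int));
	int			nheap = NELEMS / SEGSIZE;
	SegCursor  *heap = malloc(nheap * sizeof(SegCursor));
	int			i;

	for (i = 0; i < NELEMS; i++)
		a[i] = rand();

	/* "puttuple" phase: presort each segment as it fills */
	for (i = 0; i < nheap; i++)
		qsort(a + i * SEGSIZE, SEGSIZE, sizeof(int), cmp_int);

	/* "performsort" phase: k-way merge instead of re-qsorting everything */
	for (i = 0; i < nheap; i++)
	{
		heap[i].pos = a + i * SEGSIZE;
		heap[i].end = a + (i + 1) * SEGSIZE;
	}
	for (i = nheap / 2 - 1; i >= 0; i--)
		heap_siftdown(heap, nheap, i);

	for (i = 0; i < NELEMS; i++)
	{
		out[i] = *heap[0].pos++;
		if (heap[0].pos == heap[0].end)
			heap[0] = heap[--nheap];	/* this segment is exhausted */
		heap_siftdown(heap, nheap, 0);
	}

	for (i = 1; i < NELEMS; i++)
	{
		if (out[i - 1] > out[i])
		{
			fprintf(stderr, "not sorted!\n");
			return 1;
		}
	}
	printf("merged %d segments OK\n", NELEMS / SEGSIZE);
	free(a);
	free(out);
	free(heap);
	return 0;
}

tuplesort already has heap machinery for its merge phase
(tuplesort_heap_insert() and friends), so a real version could
presumably reuse that rather than a hand-rolled heap like this one.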

David
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9ca9835aab..65ffe442cd 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -216,6 +216,7 @@ struct Tuplesortstate
 	SortTuple  *memtuples;		/* array of SortTuple structs */
 	int			memtupcount;	/* number of tuples currently present */
 	int			memtupsize;		/* allocated length of memtuples array */
+	int			memtupsortedto; /* index of where we've already sorted up to */
 	bool		growmemtuples;	/* memtuples' growth still underway? */
 
 	/*
@@ -465,7 +466,7 @@ static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple
 static void dumptuples(Tuplesortstate *state, bool alltuples);
 static void make_bounded_heap(Tuplesortstate *state);
 static void sort_bounded_heap(Tuplesortstate *state);
-static void tuplesort_sort_memtuples(Tuplesortstate *state);
+static void tuplesort_sort_memtuples(Tuplesortstate *state, int start_index);
 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
 static void tuplesort_heap_delete_top(Tuplesortstate *state);
@@ -708,6 +709,7 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
 	 */
 	state->memtupsize = INITIAL_MEMTUPSIZE;
 	state->memtuples = NULL;
+	state->memtupsortedto = 0;
 
 	/*
 	 * After all of the other non-parallel-related state, we setup all of the
@@ -793,6 +795,7 @@ tuplesort_begin_batch(Tuplesortstate *state)
 	state->tapeset = NULL;
 
 	state->memtupcount = 0;
+	state->memtupsortedto = 0;
 
 	/*
 	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
@@ -1193,10 +1196,29 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre
 
 	Assert(!LEADER(state));
 
+/* just roughly. Must be power-of-2 for efficient division */
+#define EFFECTIVE_CPU_L3_SIZE 16777216
+
 	/* Count the size of the out-of-line data */
 	if (tuple->tuple != NULL)
+	{
+		size_t		before_mem = state->availMem / EFFECTIVE_CPU_L3_SIZE;
+
 		USEMEM(state, GetMemoryChunkSpace(tuple->tuple));
 
+		/*
+		 * Be a bit more cache-aware in the sort and do a presort of the
+		 * tuples seen thus far while they're likely still sitting in L3.
+		 */
+		if (before_mem != state->availMem / EFFECTIVE_CPU_L3_SIZE)
+		{
+			tuplesort_sort_memtuples(state, state->memtupsortedto);
+			state->memtupsortedto = state->memtupcount;
+		}
+	}
+
+
 	if (!useAbbrev)
 	{
 		/*
@@ -1403,7 +1425,7 @@ tuplesort_performsort(Tuplesortstate *state)
 			if (SERIAL(state))
 			{
 				/* Just qsort 'em and we're done */
-				tuplesort_sort_memtuples(state);
+				tuplesort_sort_memtuples(state, 0);
 				state->status = TSS_SORTEDINMEM;
 			}
 			else if (WORKER(state))
@@ -2388,7 +2410,7 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	 * Sort all tuples accumulated within the allowed amount of memory for
 	 * this run using quicksort
 	 */
-	tuplesort_sort_memtuples(state);
+	tuplesort_sort_memtuples(state, 0);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -2413,6 +2435,7 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	}
 
 	state->memtupcount = 0;
+	state->memtupsortedto = 0;
 
 	/*
 	 * Reset tuple memory.  We've freed all of the tuples that we previously
@@ -2636,6 +2659,8 @@ make_bounded_heap(Tuplesortstate *state)
 	reversedirection(state);
 
 	state->memtupcount = 0;		/* make the heap empty */
+	state->memtupsortedto = 0;
+
 	for (i = 0; i < tupcount; i++)
 	{
 		if (state->memtupcount < state->bound)
@@ -2711,11 +2736,11 @@ sort_bounded_heap(Tuplesortstate *state)
  * Quicksort is used for small in-memory sorts, and external sort runs.
  */
 static void
-tuplesort_sort_memtuples(Tuplesortstate *state)
+tuplesort_sort_memtuples(Tuplesortstate *state, int start_index)
 {
 	Assert(!LEADER(state));
 
-	if (state->memtupcount > 1)
+	if (state->memtupcount - start_index > 1)
 	{
 		/*
 		 * Do we have the leading column's value or abbreviation in datum1,
@@ -2725,24 +2750,24 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		{
 			if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
 			{
-				qsort_tuple_unsigned(state->memtuples,
-									 state->memtupcount,
+				qsort_tuple_unsigned(&state->memtuples[start_index],
+									 state->memtupcount - start_index,
 									 state);
 				return;
 			}
 #if SIZEOF_DATUM >= 8
 			else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
 			{
-				qsort_tuple_signed(state->memtuples,
-								   state->memtupcount,
+				qsort_tuple_signed(&state->memtuples[start_index],
+								   state->memtupcount - start_index,
 								   state);
 				return;
 			}
 #endif
 			else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
 			{
-				qsort_tuple_int32(state->memtuples,
-								  state->memtupcount,
+				qsort_tuple_int32(&state->memtuples[start_index],
+								  state->memtupcount - start_index,
 								  state);
 				return;
 			}
@@ -2751,13 +2776,14 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 		/* Can we use the single-key sort function? */
 		if (state->base.onlyKey != NULL)
 		{
-			qsort_ssup(state->memtuples, state->memtupcount,
+			qsort_ssup(&state->memtuples[start_index],
+					   state->memtupcount - start_index,
 					   state->base.onlyKey);
 		}
 		else
 		{
-			qsort_tuple(state->memtuples,
-						state->memtupcount,
+			qsort_tuple(&state->memtuples[start_index],
+						state->memtupcount - start_index,
 						state->base.comparetup,
 						state);
 		}
