Recent discussions on the threads "Double sorting split patch" and
"CUDA sorting" raised the possibility that there is significant
performance-optimisation "low-hanging fruit" to be picked by having
the executor treat integers and floats as a special case during
sorting: we would avoid the overhead of calling a comparator through
the built-in SQL function machinery, and take advantage of inlining
the comparator, which has been shown to give a considerable
performance advantage (at least compared to a general-purpose C stdlib
qsort() that takes a function pointer as its comparator, much as
tuplesort does).
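
To see the distinction concretely, here is a minimal standalone sketch
(not from the patch; an insertion sort stands in for the real
quicksort purely for brevity) of the two shapes of comparator:

#include <stddef.h>
#include <stdlib.h>

/* Generic path: every comparison is an indirect call through a
 * function pointer that the compiler cannot see through. */
static int
cmp_int(const void *a, const void *b)
{
	int		ia = *(const int *) a;
	int		ib = *(const int *) b;

	return (ia > ib) - (ia < ib);
}

static void
sort_generic(int *v, size_t n)
{
	qsort(v, n, sizeof(int), cmp_int);
}

/* Specialised path: the comparison is visible at the call site and
 * gets inlined into the sort loop, eliminating per-comparison call
 * overhead. */
static void
sort_int_specialised(int *v, size_t n)
{
	size_t	i;

	for (i = 1; i < n; i++)
	{
		int		key = v[i];
		size_t	j = i;

		while (j > 0 && v[j - 1] > key)
		{
			v[j] = v[j - 1];
			j--;
		}
		v[j] = key;
	}
}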

I've hacked together a sloppy POC implementation in a hurry
(basically, some code is shifted around), which is attached. I felt it
would be useful to determine whether the community considers this a
worthwhile undertaking before a business trip that I'm leaving on
tomorrow, lasting until Friday, during which I will be mostly
unavailable. The patch breaks the Postgres sorting executor node (at
least when it quicksorts) for any type other than int4. I apologise
for how rough the patch is, but the code itself isn't important right
now - the idea is. I anticipate that a near-future revision will set
state->datumType or something similar, so that tuplesort_performsort
knows which type-specific optimisation it can use, while falling back
on the existing generic qsort_arg + qsort_arg_comparator for other
types, so sorting won't actually be broken.
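
To make that fallback concrete, the dispatch in tuplesort_performsort
might end up looking something like this sketch (state->datumType is
the hypothetical field mentioned above; the shape is illustrative
only):

	if (state->memtupcount > 1)
	{
		switch (state->datumType)	/* hypothetical field, see above */
		{
			case INT4OID:
				/* specialised path with an inlined comparator */
				qsort_arg_int4((void *) state->memtuples,
							   state->memtupcount,
							   sizeof(SortTuple),
							   (void *) state);
				break;
			default:
				/* existing generic path */
				qsort_arg((void *) state->memtuples,
						  state->memtupcount,
						  sizeof(SortTuple),
						  (qsort_arg_comparator) state->comparetup,
						  (void *) state);
				break;
		}
	}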

I've been doing some preliminary testing using the Dell Store 2 sample
database. I increased work_mem to '50MB' (SET work_mem = '50MB';) to
ensure that the sort is performed as an in-memory quicksort;
otherwise, I'm using the postgresql.conf that initdb gave me. The
query is:
explain analyze select * from orderlines order by prod_id;
Once the cache has been warmed, explain analyze very consistently
reports a runtime of 123ms for this query on master/HEAD, varying by
+/- 1ms, with a few outliers of maybe +/- 2ms. However, when I apply
this patch, that goes down to 107ms +/- 1ms at -O0. I think that's a
pretty good start. Funnily enough, the difference/advantage vanishes
at -O2 (I'm guessing that at the higher optimisation level, GCC 4.5
makes inlining decisions that cancel out the advantage, but I don't
have time to check that right now).
I imagine the version that I actually submit for patch review will
have a macro-based infrastructure for inlining the sorting of various
built-in types, initially integers and floats. It will most likely
have some other optimisations - I haven't even used a profiler yet.
This performance patch differs from most in that it's difficult in
principle to imagine a performance regression occurring.
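
As a rough sketch of what that macro-based infrastructure might look
like (hypothetical names; real float comparisons would also need NaN
handling, and NULLs are ignored here):

/* One inlinable datum comparison per supported type... */
#define COMPARE_INT4(x, y) \
	((DatumGetInt32(x) > DatumGetInt32(y)) - \
	 (DatumGetInt32(x) < DatumGetInt32(y)))

#define COMPARE_FLOAT8(x, y) \
	((DatumGetFloat8(x) > DatumGetFloat8(y)) - \
	 (DatumGetFloat8(x) < DatumGetFloat8(y)))

/* ...plus a template macro, so that e.g.
 * TEMPLATE_QSORT_ARG(int4, COMPARE_INT4)
 * expands to the compare_int4()/comparetup_heap_int4()/qsort_arg_int4()
 * trio that is written out by hand in the attached patch. */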
Thoughts?
--
Peter Geoghegan http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training and Services
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 3505236..c5ac708 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -1224,6 +1224,285 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
}
}
+/* Inlinable three-way comparison for int4 datums */
+static inline int
+compare_int4(Datum first, Datum second)
+{
+ int32 a_is = DatumGetInt32(first);
+ int32 b_is = DatumGetInt32(second);
+
+ if (a_is > b_is)
+ return 1;
+ else if (a_is == b_is)
+ return 0;
+ else
+ return -1;
+}
+
+
+/*
+ * Inline-able copy of FunctionCall2Coll() to save some cycles in sorting.
+ */
+static inline Datum
+myFunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
+{
+ FunctionCallInfoData fcinfo;
+ Datum result;
+
+ InitFunctionCallInfoData(fcinfo, flinfo, 2, collation, NULL, NULL);
+
+ fcinfo.arg[0] = arg1;
+ fcinfo.arg[1] = arg2;
+ fcinfo.argnull[0] = false;
+ fcinfo.argnull[1] = false;
+
+ result = FunctionCallInvoke(&fcinfo);
+
+ /* Check for null result, since caller is clearly not expecting one */
+ if (fcinfo.isnull)
+ elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid);
+
+ return result;
+}
+/*
+ * Apply a sort function (by now converted to fmgr lookup form)
+ * and return a 3-way comparison result. This takes care of handling
+ * reverse-sort and NULLs-ordering properly. We assume that DESC and
+ * NULLS_FIRST options are encoded in sk_flags the same way btree does it.
+ */
+static inline int32
+inlineApplySortFunction(FmgrInfo *sortFunction, int sk_flags, Oid collation,
+ Datum datum1, bool isNull1,
+ Datum datum2, bool isNull2)
+{
+ int32 compare;
+
+ if (isNull1)
+ {
+ if (isNull2)
+ compare = 0; /* NULL "=" NULL */
+ else if (sk_flags & SK_BT_NULLS_FIRST)
+ compare = -1; /* NULL "<" NOT_NULL */
+ else
+ compare = 1; /* NULL ">" NOT_NULL */
+ }
+ else if (isNull2)
+ {
+ if (sk_flags & SK_BT_NULLS_FIRST)
+ compare = 1; /* NOT_NULL ">" NULL */
+ else
+ compare = -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ compare = compare_int4(datum1, datum2);
+
+ if (sk_flags & SK_BT_DESC)
+ compare = -compare;
+ }
+
+ return compare;
+}
+
+
+
+/*
+ * Specialisation of comparetup_heap for int4. Takes void pointers so
+ * that the qsort routine below can call it directly on raw byte
+ * pointers without casts at every call site.
+ */
+static inline int
+comparetup_heap_int4(const void *va, const void *vb, void *varg)
+{
+	const SortTuple *a = (const SortTuple *) va;
+	const SortTuple *b = (const SortTuple *) vb;
+	Tuplesortstate *state = (Tuplesortstate *) varg;
+	ScanKey		scanKey = state->scanKeys;
+ HeapTupleData ltup;
+ HeapTupleData rtup;
+ TupleDesc tupDesc;
+ int nkey;
+ int32 compare;
+
+ /* Allow interrupting long sorts */
+ CHECK_FOR_INTERRUPTS();
+ /* Compare the leading sort key */
+ compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags,
+ scanKey->sk_collation,
+ a->datum1, a->isnull1,
+ b->datum1, b->isnull1);
+ if (compare != 0)
+ return compare;
+
+
+ /* Compare additional sort keys */
+ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+ ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
+ rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+ rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
+ tupDesc = state->tupDesc;
+ scanKey++;
+
+ for (nkey = 1; nkey < state->nKeys; nkey++, scanKey++)
+ {
+ AttrNumber attno = scanKey->sk_attno;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
+		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+		compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags,
+										  scanKey->sk_collation,
+										  datum1, isnull1, datum2, isnull2);
+		if (compare != 0)
+			return compare;
+ }
+
+ return 0;
+}
+
+
+inline static char *med3(char *a, char *b, char *c,
+ void *arg);
+inline static void swapfunc(char *, char *, size_t, int);
+
+/*
+ * Qsort routine based on J. L. Bentley and M. D. McIlroy,
+ * "Engineering a sort function",
+ * Software--Practice and Experience 23 (1993) 1249-1265.
+ * We have modified their original by adding a check for already-sorted input,
+ * which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
+ */
+#define swapcode(TYPE, parmi, parmj, n) \
+do { \
+ size_t i = (n) / sizeof (TYPE); \
+ TYPE *pi = (TYPE *)(void *)(parmi); \
+ TYPE *pj = (TYPE *)(void *)(parmj); \
+ do { \
+ TYPE t = *pi; \
+ *pi++ = *pj; \
+ *pj++ = t; \
+ } while (--i > 0); \
+} while (0)
+
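+/*
+ * Choose a swap strategy up front: 0 = element is exactly one long
+ * (swap it directly), 1 = pointer and size are long-aligned (swap
+ * long words), 2 = fall back to byte-by-byte copies.
+ */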
+#define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \
+ (es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1;
+
+inline static void
+swapfunc(char *a, char *b, size_t n, int swaptype)
+{
+ if (swaptype <= 1)
+ swapcode(long, a, b, n);
+ else
+ swapcode(char, a, b, n);
+}
+
+#define swap(a, b) \
+ if (swaptype == 0) { \
+ long t = *(long *)(void *)(a); \
+ *(long *)(void *)(a) = *(long *)(void *)(b); \
+ *(long *)(void *)(b) = t; \
+ } else \
+ swapfunc(a, b, es, swaptype)
+
+#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype)
+
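+/*
+ * Median-of-three pivot selection, using the inlined int4 comparator.
+ */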
+inline static char *
+med3(char *a, char *b, char *c, void *arg)
+{
+ return comparetup_heap_int4(a, b, arg) < 0 ?
+ (comparetup_heap_int4(b, c, arg) < 0 ? b : (comparetup_heap_int4(a, c, arg) < 0 ? c : a))
+ : (comparetup_heap_int4(b, c, arg) > 0 ? b : (comparetup_heap_int4(a, c, arg) < 0 ? a : c));
+}
+
+
+
+void
+qsort_arg_int4(void *a, size_t n, size_t es, void *arg)
+{
+ char *pa,
+ *pb,
+ *pc,
+ *pd,
+ *pl,
+ *pm,
+ *pn;
+ int d,
+ r,
+ swaptype,
+ presorted;
+
+loop:SWAPINIT(a, es);
+ if (n < 7)
+ {
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+ for (pl = pm; pl > (char *) a && comparetup_heap_int4(pl - es, pl, arg) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+ presorted = 1;
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+ {
+ if (comparetup_heap_int4(pm - es, pm, arg) > 0)
+ {
+ presorted = 0;
+ break;
+ }
+ }
+ if (presorted)
+ return;
+ pm = (char *) a + (n / 2) * es;
+ if (n > 7)
+ {
+ pl = (char *) a;
+ pn = (char *) a + (n - 1) * es;
+ if (n > 40)
+ {
+ d = (n / 8) * es;
+ pl = med3(pl, pl + d, pl + 2 * d, arg);
+ pm = med3(pm - d, pm, pm + d, arg);
+ pn = med3(pn - 2 * d, pn - d, pn, arg);
+ }
+ pm = med3(pl, pm, pn, arg);
+ }
+ swap(a, pm);
+ pa = pb = (char *) a + es;
+ pc = pd = (char *) a + (n - 1) * es;
+ for (;;)
+ {
+ while (pb <= pc && (r = comparetup_heap_int4(pb, a, arg)) <= 0)
+ {
+ if (r == 0)
+ {
+ swap(pa, pb);
+ pa += es;
+ }
+ pb += es;
+ }
+ while (pb <= pc && (r = comparetup_heap_int4(pc, a, arg)) >= 0)
+ {
+ if (r == 0)
+ {
+ swap(pc, pd);
+ pd -= es;
+ }
+ pc -= es;
+ }
+ if (pb > pc)
+ break;
+ swap(pb, pc);
+ pb += es;
+ pc -= es;
+ }
+ pn = (char *) a + n * es;
+ r = Min(pa - (char *) a, pb - pa);
+ vecswap(a, pb - r, r);
+ r = Min(pd - pc, pn - pd - es);
+ vecswap(pb, pn - r, r);
+ if ((r = pb - pa) > es)
+ qsort_arg_int4(a, r / es, es, arg);
+ if ((r = pd - pc) > es)
+ {
+ /* Iterate rather than recurse to save stack space */
+ a = pn - r;
+ n = r / es;
+ goto loop;
+ }
+}
+
/*
* All tuples have been provided; finish the sort.
*/
@@ -1247,11 +1526,14 @@ tuplesort_performsort(Tuplesortstate *state)
* amount of memory. Just qsort 'em and we're done.
*/
if (state->memtupcount > 1)
- qsort_arg((void *) state->memtuples,
+ {
+ /* For this POC patch, pretend we only ever sort int4 */
+ qsort_arg_int4((void *) state->memtuples,
state->memtupcount,
sizeof(SortTuple),
- (qsort_arg_comparator) state->comparetup,
(void *) state);
+ }
state->current = 0;
state->eof_reached = false;
state->markpos_offset = 0;
@@ -2628,72 +2910,6 @@ SelectSortFunction(Oid sortOperator,
}
/*
- * Inline-able copy of FunctionCall2Coll() to save some cycles in sorting.
- */
-static inline Datum
-myFunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
-{
- FunctionCallInfoData fcinfo;
- Datum result;
-
- InitFunctionCallInfoData(fcinfo, flinfo, 2, collation, NULL, NULL);
-
- fcinfo.arg[0] = arg1;
- fcinfo.arg[1] = arg2;
- fcinfo.argnull[0] = false;
- fcinfo.argnull[1] = false;
-
- result = FunctionCallInvoke(&fcinfo);
-
- /* Check for null result, since caller is clearly not expecting one */
- if (fcinfo.isnull)
- elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid);
-
- return result;
-}
-
-/*
- * Apply a sort function (by now converted to fmgr lookup form)
- * and return a 3-way comparison result. This takes care of handling
- * reverse-sort and NULLs-ordering properly. We assume that DESC and
- * NULLS_FIRST options are encoded in sk_flags the same way btree does it.
- */
-static inline int32
-inlineApplySortFunction(FmgrInfo *sortFunction, int sk_flags, Oid collation,
- Datum datum1, bool isNull1,
- Datum datum2, bool isNull2)
-{
- int32 compare;
-
- if (isNull1)
- {
- if (isNull2)
- compare = 0; /* NULL "=" NULL */
- else if (sk_flags & SK_BT_NULLS_FIRST)
- compare = -1; /* NULL "<" NOT_NULL */
- else
- compare = 1; /* NULL ">" NOT_NULL */
- }
- else if (isNull2)
- {
- if (sk_flags & SK_BT_NULLS_FIRST)
- compare = 1; /* NOT_NULL ">" NULL */
- else
- compare = -1; /* NOT_NULL "<" NULL */
- }
- else
- {
- compare = DatumGetInt32(myFunctionCall2Coll(sortFunction, collation,
- datum1, datum2));
-
- if (sk_flags & SK_BT_DESC)
- compare = -compare;
- }
-
- return compare;
-}
-
-/*
* Non-inline ApplySortFunction() --- this is needed only to conform to
* C99's brain-dead notions about how to implement inline functions...
*/