On Sat, Dec 12, 2015 at 7:42 PM, Ants Aasma <ants.aa...@eesti.ee> wrote: > As the open coding doesn't help with eliminating control flow > dependencies, so my idea is to encode the sort network comparison > order in an array and use that to drive a simple loop. The code size > would be pretty similar to insertion sort and the loop overhead should > mostly be hidden by the CPU OoO machinery. Probably won't help much, > but would be interesting and simple enough to try out. Can you share > you code for the benchmark so I can try it out?
I can. But the further results showing the number of comparisons is higher than for insertion sort have dampened my enthusiasm for the change. I'm assuming that even if it's faster for a simple integer or sort it'll be much slower for anything that requires calling out to the datatype comparator. I also hadn't actually measured what percentage of the sort was being spent in the insertion sort. I had guessed it would be higher. The test is attached. qsort_tuple.c is copied from tuplesort (with the ifdef for NOPRESORT added, but you could skip that if you want.). Compile with something like: gcc -DNOPRESORT -O3 -DCOUNTS -Wall -Wno-unused-function simd-sort-test.c -- greg
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <time.h> #include <sys/time.h> #include <sys/types.h> #include <unistd.h> #ifdef COUNTS unsigned nswaps; unsigned ncmps; #define countswap (nswaps++) #define countcmp (ncmps++) #else #define countswap ((void)0) #define countcmp ((void)0) #endif typedef void *Tuplesortstate; typedef long long int Datum; typedef int bool; typedef struct { void *tuple; /* the tuple proper */ Datum datum1; /* value of first key column */ bool isnull1; /* is first key column NULL? */ int tupindex; /* see notes above */ } SortTuple; typedef SortTuple elem_t; typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, Tuplesortstate *state); typedef struct SortSupportData *SortSupport; static int comparator(Datum a, Datum b, SortSupport ssup) { if (b > a) return -1; else if (b < a) return 1; else return 0; }; struct SortSupportData { bool ssup_nulls_first; bool ssup_reverse; int (*comparator)(Datum,Datum,SortSupport); }; struct SortSupportData ssup = {0, 0, comparator}; static inline int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup) { int compare; countcmp; if (isNull1) { if (isNull2) compare = 0; /* NULL "=" NULL */ else if (ssup->ssup_nulls_first) compare = -1; /* NULL "<" NOT_NULL */ else compare = 1; /* NULL ">" NOT_NULL */ } else if (isNull2) { if (ssup->ssup_nulls_first) compare = 1; /* NOT_NULL ">" NULL */ else compare = -1; /* NOT_NULL "<" NULL */ } else { compare = (*ssup->comparator) (datum1, datum2, ssup); if (ssup->ssup_reverse) compare = -compare; } return compare; } #define CHECK_FOR_INTERRUPTS() do {} while (0) #define Min(a,b) ((a)<(b)?(a):(b)) #define Max(a,b) ((a)<(b)?(b):(a)) #include "qsort_tuple.c" #define mycmp(a,b) \ (ApplySortComparator(list[a].datum1, list[a].isnull1, \ list[b].datum1, list[b].isnull1, \ &ssup)) #define myswap(a,b) \ do { \ elem_t _tmp; \ _tmp = list[a]; \ list[a] = list[b]; \ list[b] = _tmp; \ countswap; \ } while (0) #define myswapif(a,b) \ do { \ if (mycmp(a,b) > 0) \ myswap(a,b); \ } while (0) static int insertion_ok(unsigned N) { return N<1000; } static void insertion(elem_t *list, unsigned N) { if (N > 1000) { printf("insertion sort not feasible for large N\n"); exit(1); } for (unsigned pm = 1; pm < N; pm++) for (unsigned pl = pm; pl && mycmp(pl-1, pl) > 0; pl--) myswap(pl, pl-1); } static int sort_networks_ok(unsigned N) { return N<=8; } static void sort_networks(elem_t *list, unsigned N) { if (N > 8) { printf("sort_networks only implemented for N in 0..8\n"); exit(1); } switch(N) { case 0: case 1: break; case 2: myswapif(0,1); break; case 3: myswapif(0,1); myswapif(1,2); myswapif(0,1); break; case 4: myswapif(0,1); myswapif(2,3); myswapif(1,3); myswapif(0,2); myswapif(1,2); break; case 5: myswapif(1,2); myswapif(3,4); myswapif(1,3); myswapif(0,2); myswapif(2,4); myswapif(0,3); myswapif(0,1); myswapif(2,3); myswapif(1,2); break; case 6: myswapif(0,1); myswapif(2,3); myswapif(4,5); myswapif(0,2); myswapif(3,5); myswapif(1,4); myswapif(0,1); myswapif(2,3); myswapif(4,5); myswapif(1,2); myswapif(3,4); myswapif(2,3); break; case 7: myswapif(1,2); myswapif(3,4); myswapif(5,6); myswapif(0,2); myswapif(4,6); myswapif(3,5); myswapif(2,6); myswapif(1,5); myswapif(0,4); myswapif(2,5); myswapif(0,3); myswapif(2,4); myswapif(1,3); myswapif(0,1); myswapif(2,3); myswapif(4,5); break; case 8: myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7); myswapif(0, 3); myswapif(1, 2); myswapif(4, 7); myswapif(5, 6); myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7); myswapif(0, 7); myswapif(1, 6); myswapif(2, 5); myswapif(3, 4); myswapif(1, 3); myswapif(0, 2); myswapif(5, 7); myswapif(4, 6); myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7); break; } } static int bitonic_ok(unsigned N) { return (N&(N-1))==0; } static void bitonic_serial(elem_t *list, unsigned N) { if (N & (N-1)) { printf("bitonic sort implemented only for powers of 2\n"); exit(1); } int i,j,k; for (k=2;k<=N;k=2*k) { for (j=k>>1;j>0;j=j>>1) { for (i=0;i<N;i++) { int ixj=i^j; if (ixj > i) { if ((i & k) == 0) myswapif(i,ixj); if ((i & k) != 0) myswapif(ixj, i); } } } } } static int qsort_compare(const void *_a, const void *_b) { const elem_t *a = _a, *b = _b; return ApplySortComparator(a->datum1, a->isnull1, b->datum1, b->isnull1, &ssup); } static void quicksort(elem_t *list, unsigned N) { qsort(list, N, sizeof(elem_t), qsort_compare); } static void test_qsort_ssup(elem_t *list, unsigned N) { qsort_ssup(list, N, &ssup); } static elem_t *generate_list(unsigned N) { elem_t *list = aligned_alloc(256, sizeof(elem_t) * N); memset(list, 0, sizeof(elem_t) * N); for (unsigned i=0 ; i<N; i++) { SortTuple x = {NULL, 0, 0, 0}; x.datum1 = (Datum)rand() << (8*(sizeof(x.datum1) - sizeof(int))); list[i] = x; } return list; } static void time_sort(char *label, unsigned seed, unsigned N, unsigned M, void (*sort)(elem_t*, unsigned)) { unsigned reps = M/N; elem_t **lists = malloc(sizeof(elem_t*) * reps); srand(seed); for (unsigned i=0; i<reps; i++) lists[i] = generate_list(N); #ifdef COUNTS nswaps = 0; ncmps = 0; #endif struct timespec t1, t2; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t1); for (unsigned i=0;i<reps; i++) sort(lists[i], N); clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t2); for (unsigned i=0; i<reps; i++) free(lists[i]); free(lists); double elapsed = ((double) (t2.tv_sec - t1.tv_sec) * 1000 * 1000 * 1000 + (t2.tv_nsec - t1.tv_nsec)) / reps; char *units = "ns"; if (elapsed > 1000) { elapsed /= 1000; units = "us"; if (elapsed > 1000) { elapsed /= 1000; units = "ms"; if (elapsed > 1000) { elapsed /= 1000; units = "s"; if (elapsed > 119) { elapsed /= 60; units = "m"; if (elapsed > 119) { elapsed /= 60; units = "h"; } } } } } printf("using %s sort %0.3f%s per sort of %u %lu-byte items", label, elapsed, units, N, sizeof(elem_t)/sizeof(char)); #ifdef COUNTS if (ncmps) printf(" %0.1f compares/sort", (double)ncmps/reps); if (nswaps) printf(" %0.1f swaps/sort", (double)nswaps/reps); #endif printf("\n"); } struct sort { char *label; void (*func)(elem_t*, unsigned); int (*ok)(unsigned); } sorts[] = { {"bitonic", bitonic_serial, bitonic_ok}, {"insertion", insertion, insertion_ok}, {"sort networks", sort_networks, sort_networks_ok}, {"libc quicksort", quicksort, NULL}, {"qsort_ssup", test_qsort_ssup, NULL}, }; #define NSORTS (sizeof(sorts)/sizeof(*sorts)) static void time_several_sorts(unsigned N, unsigned M) { struct timeval tv; gettimeofday(&tv, NULL); unsigned seed = getpid() ^ tv.tv_sec ^ tv.tv_usec; srand(seed); seed = rand(); for (unsigned i=0; i<NSORTS; i++) { if (!sorts[i].ok || sorts[i].ok(N)) time_sort(sorts[i].label, seed, N, M, sorts[i].func); } } int main (int argc, char *argv[], char *envp[]) { unsigned N; if (argc > 1) { N = atoi(argv[1]); } else { N = 16384; } time_several_sorts(N, 1<<23); }
/* * autogenerated by src/backend/utils/sort/gen_qsort_tuple.pl, do not edit! * * This file is included by tuplesort.c, rather than compiled separately. */ /* $NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $ */ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Qsort routine based on J. L. Bentley and M. D. McIlroy, * "Engineering a sort function", * Software--Practice and Experience 23 (1993) 1249-1265. * * We have modified their original by adding a check for already-sorted input, * which seems to be a win per discussions on pgsql-hackers around 2006-03-21. * * Also, we recurse on the smaller partition and iterate on the larger one, * which ensures we cannot recurse more than log(N) levels (since the * partition recursed to is surely no more than half of the input). Bentley * and McIlroy explicitly rejected doing this on the grounds that it's "not * worth the effort", but we have seen crashes in the field due to stack * overrun, so that judgment seems wrong. */ static void swapfunc(SortTuple *a, SortTuple *b, size_t n) { do { SortTuple t = *a; *a++ = *b; *b++ = t; } while (--n > 0); } #define swap(a, b) \ do { \ SortTuple t = *(a); \ *(a) = *(b); \ *(b) = t; \ } while (0); #define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n) static SortTuple * med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, Tuplesortstate *state) { return cmp_tuple(a, b, state) < 0 ? (cmp_tuple(b, c, state) < 0 ? b : (cmp_tuple(a, c, state) < 0 ? c : a)) : (cmp_tuple(b, c, state) > 0 ? b : (cmp_tuple(a, c, state) < 0 ? a : c)); } static void qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, Tuplesortstate *state) { SortTuple *pa, *pb, *pc, *pd, *pl, *pm, *pn; size_t d1, d2; int r, presorted; loop: CHECK_FOR_INTERRUPTS(); if (n <= 8) { #define swapif(l,r) if (cmp_tuple(a+l,a+r, state) > 0) swap(a+l,a+r) switch(n) { case 0: case 1: break; case 2: swapif(0,1); break; case 3: swapif(0,1); swapif(1,2); swapif(0,1); break; case 4: swapif(0,1); swapif(2,3); swapif(1,3); swapif(0,2); swapif(1,2); break; case 5: swapif(1,2); swapif(3,4); swapif(1,3); swapif(0,2); swapif(2,4); swapif(0,3); swapif(0,1); swapif(2,3); swapif(1,2); break; case 6: swapif(0,1); swapif(2,3); swapif(4,5); swapif(0,2); swapif(3,5); swapif(1,4); swapif(0,1); swapif(2,3); swapif(4,5); swapif(1,2); swapif(3,4); swapif(2,3); break; case 7: swapif(1,2); swapif(3,4); swapif(5,6); swapif(0,2); swapif(4,6); swapif(3,5); swapif(2,6); swapif(1,5); swapif(0,4); swapif(2,5); swapif(0,3); swapif(2,4); swapif(1,3); swapif(0,1); swapif(2,3); swapif(4,5); break; case 8: swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7); swapif(0, 3); swapif(1, 2); swapif(4, 7); swapif(5, 6); swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7); swapif(0, 7); swapif(1, 6); swapif(2, 5); swapif(3, 4); swapif(1, 3); swapif(0, 2); swapif(5, 7); swapif(4, 6); swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7); break; } return; #undef swapif } presorted = 1; for (pm = a + 1; pm < a + n; pm++) { CHECK_FOR_INTERRUPTS(); if (cmp_tuple(pm - 1, pm, state) > 0) { presorted = 0; break; } } if (presorted) return; pm = a + (n / 2); if (n > 7) { pl = a; pn = a + (n - 1); if (n > 40) { size_t d = (n / 8); pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state); pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state); pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state); } pm = med3_tuple(pl, pm, pn, cmp_tuple, state); } swap(a, pm); pa = pb = a + 1; pc = pd = a + (n - 1); for (;;) { while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0) { if (r == 0) { swap(pa, pb); pa++; } pb++; CHECK_FOR_INTERRUPTS(); } while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0) { if (r == 0) { swap(pc, pd); pd--; } pc--; CHECK_FOR_INTERRUPTS(); } if (pb > pc) break; swap(pb, pc); pb++; pc--; } pn = a + n; d1 = Min(pa - a, pb - pa); vecswap(a, pb - d1, d1); d1 = Min(pd - pc, pn - pd - 1); vecswap(pb, pn - d1, d1); d1 = pb - pa; d2 = pd - pc; if (d1 <= d2) { /* Recurse on left partition, then iterate on right partition */ if (d1 > 1) qsort_tuple(a, d1, cmp_tuple, state); if (d2 > 1) { /* Iterate rather than recurse to save stack space */ /* qsort_tuple(pn - d2, d2, cmp_tuple, state); */ a = pn - d2; n = d2; goto loop; } } else { /* Recurse on right partition, then iterate on left partition */ if (d2 > 1) qsort_tuple(pn - d2, d2, cmp_tuple, state); if (d1 > 1) { /* Iterate rather than recurse to save stack space */ /* qsort_tuple(a, d1, cmp_tuple, state); */ n = d1; goto loop; } } } #define cmp_ssup(a, b, ssup) \ ApplySortComparator((a)->datum1, (a)->isnull1, \ (b)->datum1, (b)->isnull1, ssup) static SortTuple * med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup) { return cmp_ssup(a, b, ssup) < 0 ? (cmp_ssup(b, c, ssup) < 0 ? b : (cmp_ssup(a, c, ssup) < 0 ? c : a)) : (cmp_ssup(b, c, ssup) > 0 ? b : (cmp_ssup(a, c, ssup) < 0 ? a : c)); } static void qsort_ssup(SortTuple *a, size_t n, SortSupport ssup) { SortTuple *pa, *pb, *pc, *pd, *pl, *pm, *pn; size_t d1, d2; int r; loop: CHECK_FOR_INTERRUPTS(); if (n < 7) { for (pm = a + 1; pm < a + n; pm++) for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--) swap(pl, pl - 1); return; } #ifndef NOPRESORT int presorted = 1; for (pm = a + 1; pm < a + n; pm++) { CHECK_FOR_INTERRUPTS(); if (cmp_ssup(pm - 1, pm, ssup) > 0) { presorted = 0; break; } } if (presorted) return; #endif pm = a + (n / 2); if (n > 7) { pl = a; pn = a + (n - 1); if (n > 40) { size_t d = (n / 8); pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup); pm = med3_ssup(pm - d, pm, pm + d, ssup); pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup); } pm = med3_ssup(pl, pm, pn, ssup); } swap(a, pm); pa = pb = a + 1; pc = pd = a + (n - 1); for (;;) { while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0) { if (r == 0) { swap(pa, pb); pa++; } pb++; CHECK_FOR_INTERRUPTS(); } while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0) { if (r == 0) { swap(pc, pd); pd--; } pc--; CHECK_FOR_INTERRUPTS(); } if (pb > pc) break; swap(pb, pc); pb++; pc--; } pn = a + n; d1 = Min(pa - a, pb - pa); vecswap(a, pb - d1, d1); d1 = Min(pd - pc, pn - pd - 1); vecswap(pb, pn - d1, d1); d1 = pb - pa; d2 = pd - pc; if (d1 <= d2) { /* Recurse on left partition, then iterate on right partition */ if (d1 > 1) qsort_ssup(a, d1, ssup); if (d2 > 1) { /* Iterate rather than recurse to save stack space */ /* qsort_ssup(pn - d2, d2, ssup); */ a = pn - d2; n = d2; goto loop; } } else { /* Recurse on right partition, then iterate on left partition */ if (d2 > 1) qsort_ssup(pn - d2, d2, ssup); if (d1 > 1) { /* Iterate rather than recurse to save stack space */ /* qsort_ssup(a, d1, ssup); */ n = d1; goto loop; } } }
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers