On Sat, Dec 12, 2015 at 7:42 PM, Ants Aasma <ants.aa...@eesti.ee> wrote:
> As the open coding doesn't help with eliminating control flow
> dependencies, so my idea is to encode the sort network comparison
> order in an array and use that to drive a simple loop. The code size
> would be pretty similar to insertion sort and the loop overhead should
> mostly be hidden by the CPU OoO machinery. Probably won't help much,
> but would be interesting and simple enough to try out. Can you share
> you code for the benchmark so I can try it out?

I can. But the further results showing the number of comparisons is
higher than for insertion sort have dampened my enthusiasm for the
change. I'm assuming that even if it's faster for a simple integer or
sort it'll be much slower for anything that requires calling out to
the datatype comparator. I also hadn't actually measured what
percentage of the sort was being spent in the insertion sort. I had
guessed it would be higher.

The test is attached. qsort_tuple.c is copied from tuplesort (with the
ifdef for NOPRESORT added, but you could skip that if you want.).
Compile with something like:

gcc -DNOPRESORT -O3 -DCOUNTS -Wall -Wno-unused-function simd-sort-test.c


-- 
greg
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#ifdef COUNTS
unsigned nswaps;
unsigned ncmps;
#define countswap (nswaps++)
#define countcmp  (ncmps++)
#else
#define countswap ((void)0)
#define countcmp  ((void)0)
#endif

typedef void *Tuplesortstate;
typedef long long int Datum;
typedef int  bool;

typedef struct
{
	void	   *tuple;			/* the tuple proper */
	Datum		datum1;			/* value of first key column */
	bool		isnull1;		/* is first key column NULL? */
	int			tupindex;		/* see notes above */
} SortTuple;

typedef SortTuple elem_t;

typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
									Tuplesortstate *state);

typedef struct SortSupportData *SortSupport;

static int comparator(Datum a, Datum b, SortSupport ssup) {
	if (b > a)
		return -1;
	else if (b < a)
		return 1;
	else return 0;
};

struct SortSupportData {
	bool ssup_nulls_first;
	bool ssup_reverse;
	int (*comparator)(Datum,Datum,SortSupport);
};

struct SortSupportData ssup = {0, 0, comparator};

static inline int
ApplySortComparator(Datum datum1, bool isNull1,
					Datum datum2, bool isNull2,
					SortSupport ssup)
{
	int			compare;

	countcmp;

	if (isNull1)
	{
		if (isNull2)
			compare = 0;		/* NULL "=" NULL */
		else if (ssup->ssup_nulls_first)
			compare = -1;		/* NULL "<" NOT_NULL */
		else
			compare = 1;		/* NULL ">" NOT_NULL */
	}
	else if (isNull2)
	{
		if (ssup->ssup_nulls_first)
			compare = 1;		/* NOT_NULL ">" NULL */
		else
			compare = -1;		/* NOT_NULL "<" NULL */
	}
	else
	{
		compare = (*ssup->comparator) (datum1, datum2, ssup);
		if (ssup->ssup_reverse)
			compare = -compare;
	}

	return compare;
}

#define CHECK_FOR_INTERRUPTS() do {} while (0)

#define Min(a,b) ((a)<(b)?(a):(b))
#define Max(a,b) ((a)<(b)?(b):(a))

#include "qsort_tuple.c"

#define mycmp(a,b)					\
	(ApplySortComparator(list[a].datum1, list[a].isnull1, \
						 list[b].datum1, list[b].isnull1, \
						 &ssup))

#define myswap(a,b)					\
	do {							\
		elem_t _tmp;				\
		_tmp = list[a];				\
		list[a] = list[b];			\
		list[b] = _tmp;				\
		countswap;					\
	} while (0)

#define myswapif(a,b)				\
	do {							\
		if (mycmp(a,b) > 0)			\
			myswap(a,b);				\
	} while (0)


static int insertion_ok(unsigned N) {
	return N<1000;
}

static void insertion(elem_t *list, unsigned N) {
	if (N > 1000) {
		printf("insertion sort not feasible for large N\n");
		exit(1);
	}

	for (unsigned pm = 1; pm < N; pm++)
		for (unsigned pl = pm; pl && mycmp(pl-1, pl) > 0; pl--)
			myswap(pl, pl-1);
}

static int sort_networks_ok(unsigned N) {
	return N<=8;
}

static void sort_networks(elem_t *list, unsigned N) {
	if (N > 8) {
		printf("sort_networks only implemented for N in 0..8\n");
		exit(1);
	}
	switch(N) {
	case 0: 
	case 1:
		break;
		
	case 2:
		myswapif(0,1);
		break;
		
	case 3:
		myswapif(0,1); myswapif(1,2);
		myswapif(0,1);
		break;
		
	case 4:
		myswapif(0,1); myswapif(2,3);
		myswapif(1,3); myswapif(0,2);
		myswapif(1,2);
		break;
		
	case 5:
		myswapif(1,2); myswapif(3,4);
		myswapif(1,3); myswapif(0,2);
		myswapif(2,4); myswapif(0,3);
		myswapif(0,1); myswapif(2,3);
		myswapif(1,2);
		break;
		
	case 6:
		myswapif(0,1); myswapif(2,3); myswapif(4,5);
		myswapif(0,2); myswapif(3,5); myswapif(1,4);
		myswapif(0,1); myswapif(2,3); myswapif(4,5);
		myswapif(1,2); myswapif(3,4);
		myswapif(2,3);
		break;
		
	case 7:
		myswapif(1,2); myswapif(3,4); myswapif(5,6);
		myswapif(0,2); myswapif(4,6); myswapif(3,5);
		myswapif(2,6); myswapif(1,5); myswapif(0,4);
		myswapif(2,5); myswapif(0,3);
		myswapif(2,4); myswapif(1,3);
		myswapif(0,1); myswapif(2,3); myswapif(4,5);
		break;
		
	case 8:
		myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7);
		myswapif(0, 3); myswapif(1, 2); myswapif(4, 7); myswapif(5, 6);
		myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7);
		myswapif(0, 7); myswapif(1, 6); myswapif(2, 5); myswapif(3, 4);
		myswapif(1, 3); myswapif(0, 2); myswapif(5, 7); myswapif(4, 6);
		myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7);
		break;
	}
}

static int bitonic_ok(unsigned N) {
	return (N&(N-1))==0;
}

static void bitonic_serial(elem_t *list, unsigned N) {

	if (N & (N-1)) {
		printf("bitonic sort implemented only for powers of 2\n");
		exit(1);
	}

    int i,j,k;
    for (k=2;k<=N;k=2*k) {
		for (j=k>>1;j>0;j=j>>1) {
			for (i=0;i<N;i++) {
				int ixj=i^j;
				if (ixj > i) {
					if ((i & k) == 0)
						myswapif(i,ixj);
					if ((i & k) != 0)
						myswapif(ixj, i);
				}
			}
		}
    }
}

static int qsort_compare(const void *_a, const void *_b) {
	const elem_t *a = _a, *b = _b;

	return ApplySortComparator(a->datum1, a->isnull1,
							   b->datum1, b->isnull1,
							   &ssup);
}

static void quicksort(elem_t *list, unsigned N) {
	qsort(list, N, sizeof(elem_t), qsort_compare);
}


static void test_qsort_ssup(elem_t *list, unsigned N) {
	qsort_ssup(list, N, &ssup);
}

static elem_t *generate_list(unsigned N) {
	elem_t *list = aligned_alloc(256, sizeof(elem_t) * N);
    memset(list, 0, sizeof(elem_t) * N);
	for (unsigned i=0 ; i<N; i++) {
		SortTuple x = {NULL, 0, 0, 0};
		x.datum1 =  (Datum)rand() << (8*(sizeof(x.datum1) - sizeof(int)));
		list[i] = x;
	}

	return list;
}	


static void time_sort(char *label, unsigned seed, unsigned N, unsigned M, void (*sort)(elem_t*, unsigned)) {
	unsigned reps = M/N;
	elem_t **lists = malloc(sizeof(elem_t*) * reps);

	srand(seed);
	for (unsigned i=0; i<reps; i++)
		lists[i] = generate_list(N);

#ifdef COUNTS
	nswaps = 0;
	ncmps = 0;
#endif

	struct timespec t1, t2;
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t1);
	for (unsigned i=0;i<reps; i++)
		sort(lists[i], N);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t2);

	for (unsigned i=0; i<reps; i++)
		free(lists[i]);
	free(lists);

	double elapsed = ((double)
					  (t2.tv_sec - t1.tv_sec) * 1000 * 1000 * 1000 +
					  (t2.tv_nsec - t1.tv_nsec)) / reps;

	char *units = "ns";
	if (elapsed > 1000) {
		elapsed /= 1000;
		units = "us";
		if (elapsed > 1000) {
			elapsed /= 1000;
			units = "ms";
			if (elapsed > 1000) {
				elapsed /= 1000;
				units = "s";
				if (elapsed > 119) {
					elapsed /= 60;
					units = "m";
					if (elapsed > 119) {
						elapsed /= 60;
						units = "h";
					}
				}
			}
		}
	}

	printf("using %s sort %0.3f%s per sort of %u %lu-byte items",
		   label,
		   elapsed, units,
		   N, sizeof(elem_t)/sizeof(char));
#ifdef COUNTS
	if (ncmps)
		printf(" %0.1f compares/sort", (double)ncmps/reps);
	if (nswaps)
		printf(" %0.1f swaps/sort", (double)nswaps/reps);
#endif
	printf("\n");
}

struct sort {
	char *label;
	void (*func)(elem_t*, unsigned);
	int (*ok)(unsigned);
} sorts[] = {
	{"bitonic", bitonic_serial, bitonic_ok},
	{"insertion", insertion, insertion_ok},
	{"sort networks", sort_networks, sort_networks_ok},
	{"libc quicksort", quicksort, NULL},
	{"qsort_ssup", test_qsort_ssup, NULL},

};

#define NSORTS (sizeof(sorts)/sizeof(*sorts))

static void time_several_sorts(unsigned N, unsigned M) {
	struct timeval tv;
	gettimeofday(&tv, NULL);
	unsigned seed = getpid() ^ tv.tv_sec ^ tv.tv_usec;
	srand(seed);

	seed = rand();
	for (unsigned i=0; i<NSORTS; i++) {
		if (!sorts[i].ok || sorts[i].ok(N))
			time_sort(sorts[i].label, seed, N, M, sorts[i].func);
	}
}

int main (int argc, char *argv[], char *envp[]) {
	unsigned N;
	if (argc > 1) {
		N = atoi(argv[1]);
	} else {
		N = 16384;
	}


	time_several_sorts(N, 1<<23);
}
/*
 * autogenerated by src/backend/utils/sort/gen_qsort_tuple.pl, do not edit!
 *
 * This file is included by tuplesort.c, rather than compiled separately.
 */

/*	$NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $	*/

/*-
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *	  notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *	  notice, this list of conditions and the following disclaimer in the
 *	  documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *	  may be used to endorse or promote products derived from this software
 *	  without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Qsort routine based on J. L. Bentley and M. D. McIlroy,
 * "Engineering a sort function",
 * Software--Practice and Experience 23 (1993) 1249-1265.
 *
 * We have modified their original by adding a check for already-sorted input,
 * which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
 *
 * Also, we recurse on the smaller partition and iterate on the larger one,
 * which ensures we cannot recurse more than log(N) levels (since the
 * partition recursed to is surely no more than half of the input).  Bentley
 * and McIlroy explicitly rejected doing this on the grounds that it's "not
 * worth the effort", but we have seen crashes in the field due to stack
 * overrun, so that judgment seems wrong.
 */

static void
swapfunc(SortTuple *a, SortTuple *b, size_t n)
{
	do
	{
		SortTuple 	t = *a;
		*a++ = *b;
		*b++ = t;
	} while (--n > 0);
}

#define swap(a, b)						\
	do { 								\
		SortTuple t = *(a);				\
		*(a) = *(b);					\
		*(b) = t;						\
	} while (0);

#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n)

static SortTuple *
med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, Tuplesortstate *state)
{
	return cmp_tuple(a, b, state) < 0 ?
		(cmp_tuple(b, c, state) < 0 ? b :
			(cmp_tuple(a, c, state) < 0 ? c : a))
		: (cmp_tuple(b, c, state) > 0 ? b :
			(cmp_tuple(a, c, state) < 0 ? a : c));
}

static void
qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, Tuplesortstate *state)
{
	SortTuple  *pa,
			   *pb,
			   *pc,
			   *pd,
			   *pl,
			   *pm,
			   *pn;
	size_t		d1,
				d2;
	int			r,
				presorted;

loop:
	CHECK_FOR_INTERRUPTS();
	if (n <= 8) {
		#define swapif(l,r) if (cmp_tuple(a+l,a+r, state) > 0) swap(a+l,a+r)
		switch(n) {
			case 0: 
			case 1:
			break;

			case 2:
			swapif(0,1);
			break;

			case 3:
			swapif(0,1); swapif(1,2);
			swapif(0,1);
			break;

			case 4:
			swapif(0,1); swapif(2,3);
			swapif(1,3); swapif(0,2);
			swapif(1,2);
			break;

			case 5:
			swapif(1,2); swapif(3,4);
			swapif(1,3); swapif(0,2);
			swapif(2,4); swapif(0,3);
			swapif(0,1); swapif(2,3);
			swapif(1,2);
			break;

			case 6:
			swapif(0,1); swapif(2,3); swapif(4,5);
			swapif(0,2); swapif(3,5); swapif(1,4);
			swapif(0,1); swapif(2,3); swapif(4,5);
			swapif(1,2); swapif(3,4);
			swapif(2,3);
			break;

			case 7:
			swapif(1,2); swapif(3,4); swapif(5,6);
			swapif(0,2); swapif(4,6); swapif(3,5);
			swapif(2,6); swapif(1,5); swapif(0,4);
			swapif(2,5); swapif(0,3);
			swapif(2,4); swapif(1,3);
			swapif(0,1); swapif(2,3); swapif(4,5);
			break;

			case 8:
			swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7);
			swapif(0, 3); swapif(1, 2); swapif(4, 7); swapif(5, 6);
			swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7);
			swapif(0, 7); swapif(1, 6); swapif(2, 5); swapif(3, 4);
			swapif(1, 3); swapif(0, 2); swapif(5, 7); swapif(4, 6);
			swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7);
			break;
		}
		return;
		#undef swapif
	}
	presorted = 1;
	for (pm = a + 1; pm < a + n; pm++)
	{
		CHECK_FOR_INTERRUPTS();
		if (cmp_tuple(pm - 1, pm, state) > 0)
		{
			presorted = 0;
			break;
		}
	}
	if (presorted)
		return;
	pm = a + (n / 2);
	if (n > 7)
	{
		pl = a;
		pn = a + (n - 1);
		if (n > 40)
		{
			size_t		d = (n / 8);

			pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state);
			pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state);
			pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state);
		}
		pm = med3_tuple(pl, pm, pn, cmp_tuple, state);
	}
	swap(a, pm);
	pa = pb = a + 1;
	pc = pd = a + (n - 1);
	for (;;)
	{
		while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0)
		{
			if (r == 0)
			{
				swap(pa, pb);
				pa++;
			}
			pb++;
			CHECK_FOR_INTERRUPTS();
		}
		while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0)
		{
			if (r == 0)
			{
				swap(pc, pd);
				pd--;
			}
			pc--;
			CHECK_FOR_INTERRUPTS();
		}
		if (pb > pc)
			break;
		swap(pb, pc);
		pb++;
		pc--;
	}
	pn = a + n;
	d1 = Min(pa - a, pb - pa);
	vecswap(a, pb - d1, d1);
	d1 = Min(pd - pc, pn - pd - 1);
	vecswap(pb, pn - d1, d1);
	d1 = pb - pa;
	d2 = pd - pc;
	if (d1 <= d2)
	{
		/* Recurse on left partition, then iterate on right partition */
		if (d1 > 1)
			qsort_tuple(a, d1, cmp_tuple, state);
		if (d2 > 1)
		{
			/* Iterate rather than recurse to save stack space */
			/* qsort_tuple(pn - d2, d2, cmp_tuple, state); */
			a = pn - d2;
			n = d2;
			goto loop;
		}
	}
	else
	{
		/* Recurse on right partition, then iterate on left partition */
		if (d2 > 1)
			qsort_tuple(pn - d2, d2, cmp_tuple, state);
		if (d1 > 1)
		{
			/* Iterate rather than recurse to save stack space */
			/* qsort_tuple(a, d1, cmp_tuple, state); */
			n = d1;
			goto loop;
		}
	}
}

#define cmp_ssup(a, b, ssup) \
	ApplySortComparator((a)->datum1, (a)->isnull1, \
						(b)->datum1, (b)->isnull1, ssup)

static SortTuple *
med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup)
{
	return cmp_ssup(a, b, ssup) < 0 ?
		(cmp_ssup(b, c, ssup) < 0 ? b :
			(cmp_ssup(a, c, ssup) < 0 ? c : a))
		: (cmp_ssup(b, c, ssup) > 0 ? b :
			(cmp_ssup(a, c, ssup) < 0 ? a : c));
}

static void
qsort_ssup(SortTuple *a, size_t n, SortSupport ssup)
{
	SortTuple  *pa,
			   *pb,
			   *pc,
			   *pd,
			   *pl,
			   *pm,
			   *pn;
	size_t		d1,
				d2;
	int			r;

loop:
	CHECK_FOR_INTERRUPTS();
	if (n < 7)
	{
		for (pm = a + 1; pm < a + n; pm++)
			for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--)
				swap(pl, pl - 1);
		return;
	}
#ifndef NOPRESORT
	int presorted = 1;
	for (pm = a + 1; pm < a + n; pm++)
	{
		CHECK_FOR_INTERRUPTS();
		if (cmp_ssup(pm - 1, pm, ssup) > 0)
		{
			presorted = 0;
			break;
		}
	}
	if (presorted)
		return;
#endif
	pm = a + (n / 2);
	if (n > 7)
	{
		pl = a;
		pn = a + (n - 1);
		if (n > 40)
		{
			size_t		d = (n / 8);

			pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup);
			pm = med3_ssup(pm - d, pm, pm + d, ssup);
			pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup);
		}
		pm = med3_ssup(pl, pm, pn, ssup);
	}
	swap(a, pm);
	pa = pb = a + 1;
	pc = pd = a + (n - 1);
	for (;;)
	{
		while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0)
		{
			if (r == 0)
			{
				swap(pa, pb);
				pa++;
			}
			pb++;
			CHECK_FOR_INTERRUPTS();
		}
		while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0)
		{
			if (r == 0)
			{
				swap(pc, pd);
				pd--;
			}
			pc--;
			CHECK_FOR_INTERRUPTS();
		}
		if (pb > pc)
			break;
		swap(pb, pc);
		pb++;
		pc--;
	}
	pn = a + n;
	d1 = Min(pa - a, pb - pa);
	vecswap(a, pb - d1, d1);
	d1 = Min(pd - pc, pn - pd - 1);
	vecswap(pb, pn - d1, d1);
	d1 = pb - pa;
	d2 = pd - pc;
	if (d1 <= d2)
	{
		/* Recurse on left partition, then iterate on right partition */
		if (d1 > 1)
			qsort_ssup(a, d1, ssup);
		if (d2 > 1)
		{
			/* Iterate rather than recurse to save stack space */
			/* qsort_ssup(pn - d2, d2, ssup); */
			a = pn - d2;
			n = d2;
			goto loop;
		}
	}
	else
	{
		/* Recurse on right partition, then iterate on left partition */
		if (d2 > 1)
			qsort_ssup(pn - d2, d2, ssup);
		if (d1 > 1)
		{
			/* Iterate rather than recurse to save stack space */
			/* qsort_ssup(a, d1, ssup); */
			n = d1;
			goto loop;
		}
	}
}
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to