On Tue, Oct 6, 2015 at 1:15 PM, Robert Haas <robertmh...@gmail.com> wrote:
> +        tmp = ((uint32) res ^ (uint32) ((uint64) res >> 32));
>
> The outer set of parentheses here seems pretty worthless.  Perhaps one
> of the parentheses at the end of the statement should be moved to just
> after "res".  That seems like it would add considerably clarity.

This is more or less lifted from numeric_abbrev_convert_var(). Perhaps
you should change it there too. The extra set of parenthesis are
removed in the attached patch. The patch also mechanically updates
things to be consistent with the text changes on the text thread [1]
-- I had to rebase.

>> This patch is not all that exciting -- the techniques used here are
>> very simple, and it's all familiar territory for us. I expect that
>> this patch will not be at all contentious when someone eventually gets
>> around to reviewing it.
>
> There is no doubt that we need more reviewers.

I'm trying to do review following my burst of productivity on sorting
(especially external sorting) -- I should manage to do a bunch of
patch review when I return from vacation in about a month (I leave in
a couple of weeks). I'm currently privately helping a new contributor
with a large project in its early stages, so that is something.
Perhaps you'll hear more about that before too long.

[1] 
http://www.postgresql.org/message-id/CAM3SWZTaVFBwtHF87OpNGN2r2_he-wsmN53HmqyWYPM=k51...@mail.gmail.com
-- 
Peter Geoghegan
From 1427af6b0b6ab404898e9de1c74e72e14ac31ae3 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghega...@gmail.com>
Date: Tue, 21 Jul 2015 16:18:25 -0700
Subject: [PATCH 3/3] Add SortSupport routine for UUID data type

This introduces a simple encoding scheme to produce abbreviated keys:
pack as many bytes of each UUID as will fit into a Datum.  On
little-endian machines, a byteswap is also performed; the abbreviated
comparator can therefore just consist of a simple 3-way unsigned integer
comparison.  This comports with regular UUID comparisons, because they
use memcmp() to compare authoritative UUID datums.

Note:  This patch depends on the text unsigned integer comparison patch,
which was independently submitted as infrastructure for another feature
patch that speeds up text sorts.
---
 src/backend/utils/adt/uuid.c    | 187 ++++++++++++++++++++++++++++++++++++++++
 src/include/catalog/pg_amproc.h |   1 +
 src/include/catalog/pg_proc.h   |   2 +
 src/include/utils/builtins.h    |   1 +
 4 files changed, 191 insertions(+)

diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c
index bb76b8d..c86395b 100644
--- a/src/backend/utils/adt/uuid.c
+++ b/src/backend/utils/adt/uuid.c
@@ -14,8 +14,12 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
+#include "port/pg_bswap.h"
 #include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/sortsupport.h"
 #include "utils/uuid.h"
 
 /* uuid size in bytes */
@@ -27,8 +31,21 @@ struct pg_uuid_t
 	unsigned char data[UUID_LEN];
 };
 
+/* sortsupport for uuid */
+typedef struct
+{
+	int64		input_count;	/* number of non-null values seen */
+	bool		estimating;		/* true if estimating cardinality */
+
+	hyperLogLogState abbr_card; /* cardinality estimator */
+} uuid_sortsupport_state;
+
 static void string_to_uuid(const char *source, pg_uuid_t *uuid);
 static int	uuid_internal_cmp(const pg_uuid_t *arg1, const pg_uuid_t *arg2);
+static int	uuid_fast_cmp(Datum x, Datum y, SortSupport ssup);
+static int	uuid_cmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static bool	uuid_abbrev_abort(int memtupcount, SortSupport ssup);
+static Datum	uuid_abbrev_convert(Datum original, SortSupport ssup);
 
 Datum
 uuid_in(PG_FUNCTION_ARGS)
@@ -222,6 +239,176 @@ uuid_cmp(PG_FUNCTION_ARGS)
 	PG_RETURN_INT32(uuid_internal_cmp(arg1, arg2));
 }
 
+/*
+ * Sort support strategy routine
+ */
+Datum
+uuid_sortsupport(PG_FUNCTION_ARGS)
+{
+	SortSupport	ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+	ssup->comparator = uuid_fast_cmp;
+	ssup->ssup_extra = NULL;
+
+	if (ssup->abbreviate)
+	{
+		uuid_sortsupport_state	   *uss;
+		MemoryContext				oldcontext;
+
+		oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
+
+		uss = palloc(sizeof(uuid_sortsupport_state));
+		uss->input_count = 0;
+		uss->estimating = true;
+		initHyperLogLog(&uss->abbr_card, 10);
+
+		ssup->ssup_extra = uss;
+
+		ssup->comparator = uuid_cmp_abbrev;
+		ssup->abbrev_converter = uuid_abbrev_convert;
+		ssup->abbrev_abort = uuid_abbrev_abort;
+		ssup->abbrev_full_comparator = uuid_fast_cmp;
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SortSupport comparison func
+ */
+static int
+uuid_fast_cmp(Datum x, Datum y, SortSupport ssup)
+{
+	pg_uuid_t  *arg1 = DatumGetUUIDP(x);
+	pg_uuid_t  *arg2 = DatumGetUUIDP(y);
+
+	return uuid_internal_cmp(arg1, arg2);
+}
+
+/*
+ * Abbreviated key comparison func
+ */
+static int
+uuid_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+	if (x > y)
+		return 1;
+	else if (x == y)
+		return 0;
+	else
+		return -1;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization.
+ *
+ * We pay no attention to the cardinality of the non-abbreviated data, because
+ * there is no equality fast-path within authoritative uuid comparator.
+ */
+static bool
+uuid_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+	uuid_sortsupport_state	   *uss = ssup->ssup_extra;
+	double						abbr_card;
+
+	if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating)
+		return false;
+
+	abbr_card = estimateHyperLogLog(&uss->abbr_card);
+
+	/*
+	 * If we have >100k distinct values, then even if we were sorting many
+	 * billion rows we'd likely still break even, and the penalty of undoing
+	 * that many rows of abbrevs would probably not be worth it.  Stop even
+	 * counting at that point.
+	 */
+	if (abbr_card > 100000.0)
+	{
+#ifdef TRACE_SORT
+		if (trace_sort)
+			elog(LOG,
+				 "uuid_abbrev: estimation ends at cardinality %f"
+				 " after " INT64_FORMAT " values (%d rows)",
+				 abbr_card, uss->input_count, memtupcount);
+#endif
+		uss->estimating = false;
+		return false;
+	}
+
+	/*
+	 * Target minimum cardinality is 1 per ~2k of non-null inputs.  0.5 row
+	 * fudge factor allows us to abort earlier on genuinely pathological data
+	 * where we've had exactly one abbreviated value in the first 2k (non-null)
+	 * rows.
+	 */
+	if (abbr_card < uss->input_count / 2000.0 + 0.5)
+	{
+#ifdef TRACE_SORT
+		if (trace_sort)
+			elog(LOG,
+				 "uuid_abbrev: aborting abbreviation at cardinality %f"
+				 " below threshold %f after " INT64_FORMAT " values (%d rows)",
+				 abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count,
+				 memtupcount);
+#endif
+		return true;
+	}
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG,
+			 "uuid_abbrev: cardinality %f after " INT64_FORMAT
+			 " values (%d rows)", abbr_card, uss->input_count, memtupcount);
+#endif
+
+	return false;
+}
+
+/*
+ * Conversion routine for sortsupport.  Converts original uuid representation
+ * to abbreviated key representation.  Our encoding strategy is simple -- pack
+ * the first `sizeof(Datum)` bytes of uuid data into a Datum (on little-endian
+ * machines, the bytes are stored in reverse order), and treat it as an
+ * unsigned integer.
+ */
+static Datum
+uuid_abbrev_convert(Datum original, SortSupport ssup)
+{
+	uuid_sortsupport_state	   *uss = ssup->ssup_extra;
+	pg_uuid_t				   *authoritative = DatumGetUUIDP(original);
+	Datum						res;
+
+	memcpy((char *) &res, authoritative->data, sizeof(Datum));
+	uss->input_count += 1;
+
+	if (uss->estimating)
+	{
+		uint32		tmp;
+
+#if SIZEOF_DATUM == 8
+		tmp = (uint32) res ^ (uint32) ((uint64) res >> 32);
+#else							/* SIZEOF_DATUM != 8 */
+		tmp = (uint32) res;
+#endif
+
+		addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+	}
+
+	/*
+	 * Byteswap on little-endian machines.
+	 *
+	 * This is needed so that uuid_cmp_abbrev() (an unsigned integer 3-way
+	 * comparator) works correctly on all platforms.  If this was skipped, then
+	 * the comparator would have to call memcmp() with a pair of pointers to
+	 * the first byte of each abbreviated key, which is slower.
+	 */
+	res = DatumToLittleEndian(res);
+
+	return res;
+}
+
 /* hash index support */
 Datum
 uuid_hash(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h
index b57d6e6..7db2015 100644
--- a/src/include/catalog/pg_amproc.h
+++ b/src/include/catalog/pg_amproc.h
@@ -134,6 +134,7 @@ DATA(insert (	2233   703 703 1  380 ));
 DATA(insert (	2234   704 704 1  381 ));
 DATA(insert (	2789   27 27 1 2794 ));
 DATA(insert (	2968   2950 2950 1 2960 ));
+DATA(insert (	2968   2950 2950 2 3300 ));
 DATA(insert (	2994   2249 2249 1 2987 ));
 DATA(insert (	3194   2249 2249 1 3187 ));
 DATA(insert (	3253   3220 3220 1 3251 ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index eb55b3a..495b801 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4467,6 +4467,8 @@ DATA(insert OID = 2958 (  uuid_gt		   PGNSP PGUID 12 1 0 0 0 f f f t t f i s 2 0
 DATA(insert OID = 2959 (  uuid_ne		   PGNSP PGUID 12 1 0 0 0 f f f t t f i s 2 0 16 "2950 2950" _null_ _null_ _null_ _null_ _null_ uuid_ne _null_ _null_ _null_ ));
 DATA(insert OID = 2960 (  uuid_cmp		   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "2950 2950" _null_ _null_ _null_ _null_ _null_ uuid_cmp _null_ _null_ _null_ ));
 DESCR("less-equal-greater");
+DATA(insert OID = 3300 (  uuid_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ uuid_sortsupport _null_ _null_ _null_ ));
+DESCR("sort support");
 DATA(insert OID = 2961 (  uuid_recv		   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2950 "2281" _null_ _null_ _null_ _null_ _null_ uuid_recv _null_ _null_ _null_ ));
 DESCR("I/O");
 DATA(insert OID = 2962 (  uuid_send		   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 17 "2950" _null_ _null_ _null_ _null_ _null_ uuid_send _null_ _null_ _null_ ));
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index fc1679e..cbfaa0a 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -1173,6 +1173,7 @@ extern Datum uuid_ge(PG_FUNCTION_ARGS);
 extern Datum uuid_gt(PG_FUNCTION_ARGS);
 extern Datum uuid_ne(PG_FUNCTION_ARGS);
 extern Datum uuid_cmp(PG_FUNCTION_ARGS);
+extern Datum uuid_sortsupport(PG_FUNCTION_ARGS);
 extern Datum uuid_hash(PG_FUNCTION_ARGS);
 
 /* windowfuncs.c */
-- 
1.9.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to