On Tue, Oct 6, 2015 at 1:15 PM, Robert Haas <robertmh...@gmail.com> wrote: > + tmp = ((uint32) res ^ (uint32) ((uint64) res >> 32)); > > The outer set of parentheses here seems pretty worthless. Perhaps one > of the parentheses at the end of the statement should be moved to just > after "res". That seems like it would add considerably clarity.
This is more or less lifted from numeric_abbrev_convert_var(). Perhaps you should change it there too. The extra set of parenthesis are removed in the attached patch. The patch also mechanically updates things to be consistent with the text changes on the text thread [1] -- I had to rebase. >> This patch is not all that exciting -- the techniques used here are >> very simple, and it's all familiar territory for us. I expect that >> this patch will not be at all contentious when someone eventually gets >> around to reviewing it. > > There is no doubt that we need more reviewers. I'm trying to do review following my burst of productivity on sorting (especially external sorting) -- I should manage to do a bunch of patch review when I return from vacation in about a month (I leave in a couple of weeks). I'm currently privately helping a new contributor with a large project in its early stages, so that is something. Perhaps you'll hear more about that before too long. [1] http://www.postgresql.org/message-id/CAM3SWZTaVFBwtHF87OpNGN2r2_he-wsmN53HmqyWYPM=k51...@mail.gmail.com -- Peter Geoghegan
From 1427af6b0b6ab404898e9de1c74e72e14ac31ae3 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan <peter.geoghega...@gmail.com> Date: Tue, 21 Jul 2015 16:18:25 -0700 Subject: [PATCH 3/3] Add SortSupport routine for UUID data type This introduces a simple encoding scheme to produce abbreviated keys: pack as many bytes of each UUID as will fit into a Datum. On little-endian machines, a byteswap is also performed; the abbreviated comparator can therefore just consist of a simple 3-way unsigned integer comparison. This comports with regular UUID comparisons, because they use memcmp() to compare authoritative UUID datums. Note: This patch depends on the text unsigned integer comparison patch, which was independently submitted as infrastructure for another feature patch that speeds up text sorts. --- src/backend/utils/adt/uuid.c | 187 ++++++++++++++++++++++++++++++++++++++++ src/include/catalog/pg_amproc.h | 1 + src/include/catalog/pg_proc.h | 2 + src/include/utils/builtins.h | 1 + 4 files changed, 191 insertions(+) diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index bb76b8d..c86395b 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -14,8 +14,12 @@ #include "postgres.h" #include "access/hash.h" +#include "lib/hyperloglog.h" #include "libpq/pqformat.h" +#include "port/pg_bswap.h" #include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/sortsupport.h" #include "utils/uuid.h" /* uuid size in bytes */ @@ -27,8 +31,21 @@ struct pg_uuid_t unsigned char data[UUID_LEN]; }; +/* sortsupport for uuid */ +typedef struct +{ + int64 input_count; /* number of non-null values seen */ + bool estimating; /* true if estimating cardinality */ + + hyperLogLogState abbr_card; /* cardinality estimator */ +} uuid_sortsupport_state; + static void string_to_uuid(const char *source, pg_uuid_t *uuid); static int uuid_internal_cmp(const pg_uuid_t *arg1, const pg_uuid_t *arg2); +static int uuid_fast_cmp(Datum x, Datum y, SortSupport ssup); +static int uuid_cmp_abbrev(Datum x, Datum y, SortSupport ssup); +static bool uuid_abbrev_abort(int memtupcount, SortSupport ssup); +static Datum uuid_abbrev_convert(Datum original, SortSupport ssup); Datum uuid_in(PG_FUNCTION_ARGS) @@ -222,6 +239,176 @@ uuid_cmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(uuid_internal_cmp(arg1, arg2)); } +/* + * Sort support strategy routine + */ +Datum +uuid_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = uuid_fast_cmp; + ssup->ssup_extra = NULL; + + if (ssup->abbreviate) + { + uuid_sortsupport_state *uss; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + uss = palloc(sizeof(uuid_sortsupport_state)); + uss->input_count = 0; + uss->estimating = true; + initHyperLogLog(&uss->abbr_card, 10); + + ssup->ssup_extra = uss; + + ssup->comparator = uuid_cmp_abbrev; + ssup->abbrev_converter = uuid_abbrev_convert; + ssup->abbrev_abort = uuid_abbrev_abort; + ssup->abbrev_full_comparator = uuid_fast_cmp; + + MemoryContextSwitchTo(oldcontext); + } + + PG_RETURN_VOID(); +} + +/* + * SortSupport comparison func + */ +static int +uuid_fast_cmp(Datum x, Datum y, SortSupport ssup) +{ + pg_uuid_t *arg1 = DatumGetUUIDP(x); + pg_uuid_t *arg2 = DatumGetUUIDP(y); + + return uuid_internal_cmp(arg1, arg2); +} + +/* + * Abbreviated key comparison func + */ +static int +uuid_cmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + if (x > y) + return 1; + else if (x == y) + return 0; + else + return -1; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization. + * + * We pay no attention to the cardinality of the non-abbreviated data, because + * there is no equality fast-path within authoritative uuid comparator. + */ +static bool +uuid_abbrev_abort(int memtupcount, SortSupport ssup) +{ + uuid_sortsupport_state *uss = ssup->ssup_extra; + double abbr_card; + + if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating) + return false; + + abbr_card = estimateHyperLogLog(&uss->abbr_card); + + /* + * If we have >100k distinct values, then even if we were sorting many + * billion rows we'd likely still break even, and the penalty of undoing + * that many rows of abbrevs would probably not be worth it. Stop even + * counting at that point. + */ + if (abbr_card > 100000.0) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "uuid_abbrev: estimation ends at cardinality %f" + " after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count, memtupcount); +#endif + uss->estimating = false; + return false; + } + + /* + * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row + * fudge factor allows us to abort earlier on genuinely pathological data + * where we've had exactly one abbreviated value in the first 2k (non-null) + * rows. + */ + if (abbr_card < uss->input_count / 2000.0 + 0.5) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "uuid_abbrev: aborting abbreviation at cardinality %f" + " below threshold %f after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count, + memtupcount); +#endif + return true; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "uuid_abbrev: cardinality %f after " INT64_FORMAT + " values (%d rows)", abbr_card, uss->input_count, memtupcount); +#endif + + return false; +} + +/* + * Conversion routine for sortsupport. Converts original uuid representation + * to abbreviated key representation. Our encoding strategy is simple -- pack + * the first `sizeof(Datum)` bytes of uuid data into a Datum (on little-endian + * machines, the bytes are stored in reverse order), and treat it as an + * unsigned integer. + */ +static Datum +uuid_abbrev_convert(Datum original, SortSupport ssup) +{ + uuid_sortsupport_state *uss = ssup->ssup_extra; + pg_uuid_t *authoritative = DatumGetUUIDP(original); + Datum res; + + memcpy((char *) &res, authoritative->data, sizeof(Datum)); + uss->input_count += 1; + + if (uss->estimating) + { + uint32 tmp; + +#if SIZEOF_DATUM == 8 + tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); +#else /* SIZEOF_DATUM != 8 */ + tmp = (uint32) res; +#endif + + addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + } + + /* + * Byteswap on little-endian machines. + * + * This is needed so that uuid_cmp_abbrev() (an unsigned integer 3-way + * comparator) works correctly on all platforms. If this was skipped, then + * the comparator would have to call memcmp() with a pair of pointers to + * the first byte of each abbreviated key, which is slower. + */ + res = DatumToLittleEndian(res); + + return res; +} + /* hash index support */ Datum uuid_hash(PG_FUNCTION_ARGS) diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index b57d6e6..7db2015 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -134,6 +134,7 @@ DATA(insert ( 2233 703 703 1 380 )); DATA(insert ( 2234 704 704 1 381 )); DATA(insert ( 2789 27 27 1 2794 )); DATA(insert ( 2968 2950 2950 1 2960 )); +DATA(insert ( 2968 2950 2950 2 3300 )); DATA(insert ( 2994 2249 2249 1 2987 )); DATA(insert ( 3194 2249 2249 1 3187 )); DATA(insert ( 3253 3220 3220 1 3251 )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index eb55b3a..495b801 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4467,6 +4467,8 @@ DATA(insert OID = 2958 ( uuid_gt PGNSP PGUID 12 1 0 0 0 f f f t t f i s 2 0 DATA(insert OID = 2959 ( uuid_ne PGNSP PGUID 12 1 0 0 0 f f f t t f i s 2 0 16 "2950 2950" _null_ _null_ _null_ _null_ _null_ uuid_ne _null_ _null_ _null_ )); DATA(insert OID = 2960 ( uuid_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "2950 2950" _null_ _null_ _null_ _null_ _null_ uuid_cmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); +DATA(insert OID = 3300 ( uuid_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ uuid_sortsupport _null_ _null_ _null_ )); +DESCR("sort support"); DATA(insert OID = 2961 ( uuid_recv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2950 "2281" _null_ _null_ _null_ _null_ _null_ uuid_recv _null_ _null_ _null_ )); DESCR("I/O"); DATA(insert OID = 2962 ( uuid_send PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 17 "2950" _null_ _null_ _null_ _null_ _null_ uuid_send _null_ _null_ _null_ )); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index fc1679e..cbfaa0a 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -1173,6 +1173,7 @@ extern Datum uuid_ge(PG_FUNCTION_ARGS); extern Datum uuid_gt(PG_FUNCTION_ARGS); extern Datum uuid_ne(PG_FUNCTION_ARGS); extern Datum uuid_cmp(PG_FUNCTION_ARGS); +extern Datum uuid_sortsupport(PG_FUNCTION_ARGS); extern Datum uuid_hash(PG_FUNCTION_ARGS); /* windowfuncs.c */ -- 1.9.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers