On Fri, Aug 22, 2014 at 7:19 AM, Robert Haas <robertmh...@gmail.com> wrote: > Patch 0002 no longer applies; please rebase.
I attach rebased patch. Note that there is currently a bug in the master branch: + if (len2 >= tss->buflen2) + { + pfree(tss->buf2); + tss->buflen1 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); + tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); + } Thanks -- Peter Geoghegan
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c new file mode 100644 index c09ca7e..adf4b8c *** a/src/backend/commands/analyze.c --- b/src/backend/commands/analyze.c *************** compute_scalar_stats(VacAttrStatsP stats *** 2292,2297 **** --- 2292,2303 ---- /* We always use the default collation for statistics */ ssup.ssup_collation = DEFAULT_COLLATION_OID; ssup.ssup_nulls_first = false; + /* + * For now, don't perform abbreviated key conversion, because full values + * are required for MCV slot generation. Supporting that optimization + * would necessitate teaching compare_scalars() to call a tie-breaker. + */ + ssup.position = sortKeyOther; PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c new file mode 100644 index 510d1c5..3b335e7 *** a/src/backend/executor/nodeAgg.c --- b/src/backend/executor/nodeAgg.c *************** initialize_aggregates(AggState *aggstate *** 346,351 **** --- 346,352 ---- { AggStatePerAgg peraggstate = &peragg[aggno]; AggStatePerGroup pergroupstate = &pergroup[aggno]; + Agg *node = (Agg *) aggstate->ss.ps.plan; /* * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate. *************** initialize_aggregates(AggState *aggstate *** 363,368 **** --- 364,373 ---- * We use a plain Datum sorter when there's a single input column; * otherwise sort the full tuple. (See comments for * process_ordered_aggregate_single.) + * + * FIXME: It ought to be useful to force tuplesort_begin_heap() + * case where the abbreviated key optimization can thereby be used, + * even when numInputs == 1. */ peraggstate->sortstate = (peraggstate->numInputs == 1) ? *************** initialize_aggregates(AggState *aggstate *** 377,383 **** peraggstate->sortOperators, peraggstate->sortCollations, peraggstate->sortNullsFirst, ! 
work_mem, false); } /* --- 382,390 ---- peraggstate->sortOperators, peraggstate->sortCollations, peraggstate->sortNullsFirst, ! work_mem, ! node->plan.plan_rows, ! false); } /* diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c new file mode 100644 index 47ed068..677a953 *** a/src/backend/executor/nodeMergeAppend.c --- b/src/backend/executor/nodeMergeAppend.c *************** ExecInitMergeAppend(MergeAppend *node, E *** 137,142 **** --- 137,151 ---- sortKey->ssup_nulls_first = node->nullsFirst[i]; sortKey->ssup_attno = node->sortColIdx[i]; + /* + * It isn't feasible to perform abbreviated key conversion, since + * tuples are pulled into mergestate's binary heap as needed. It would + * likely be counter-productive to convert tuples into an abbreviated + * representation as they're pulled up, so opt out of that additional + * optimization entirely. + */ + sortKey->position = sortKeyOther; + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); } diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c new file mode 100644 index fdf2f4c..39f0ff1 *** a/src/backend/executor/nodeMergejoin.c --- b/src/backend/executor/nodeMergejoin.c *************** MJExamineQuals(List *mergeclauses, *** 229,234 **** --- 229,242 ---- elog(ERROR, "cannot merge using non-equality operator %u", qual->opno); + /* + * sortsupport routine must know if abbreviation optimization is + * applicable in principle. It is never applicable for merge joins + * because there is no convenient opportunity to convert to alternative + * representation. Only elide fmgr trampoline where supported. 
+ */ + clause->ssup.position = sortKeyOther; + /* And get the matching support or comparison function */ Assert(clause->ssup.comparator == NULL); sortfunc = get_opfamily_proc(opfamily, diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c new file mode 100644 index b88571b..31d3ead *** a/src/backend/executor/nodeSort.c --- b/src/backend/executor/nodeSort.c *************** ExecSort(SortState *node) *** 89,94 **** --- 89,95 ---- plannode->collations, plannode->nullsFirst, work_mem, + plannode->plan.plan_rows, node->randomAccess); if (node->bounded) tuplesort_set_bound(tuplesortstate, node->bound); diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile new file mode 100644 index 327a1bc..14f9a46 *** a/src/backend/lib/Makefile --- b/src/backend/lib/Makefile *************** subdir = src/backend/lib *** 12,17 **** top_builddir = ../../.. include $(top_builddir)/src/Makefile.global ! OBJS = ilist.o binaryheap.o stringinfo.o include $(top_srcdir)/src/backend/common.mk --- 12,17 ---- top_builddir = ../../.. include $(top_builddir)/src/Makefile.global ! OBJS = ilist.o binaryheap.o hyperloglog.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c new file mode 100644 index ...4db42bf *** a/src/backend/lib/hyperloglog.c --- b/src/backend/lib/hyperloglog.c *************** *** 0 **** --- 1,228 ---- + /*------------------------------------------------------------------------- + * + * hyperloglog.c + * HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + + /* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + + #include "postgres.h" + + #include <math.h> + + #include "lib/hyperloglog.h" + + #define POW_2_32 (4294967296.0) + #define NEG_POW_2_32 (-4294967296.0) + + static inline uint8 rho(uint32 x, uint8 b); + + /* + * Initialize HyperLogLog track state + * + * bwidth is bit width (so register size will be 2 to the power of bwidth). + * Must be between 4 and 16 inclusive. + */ + void + initHyperLogLog(hyperLogLogState *cState, uint8 bwidth) + { + double alpha; + + if (bwidth < 4 || bwidth > 16) + elog(ERROR, "bit width must be between 4 and 16 inclusive"); + + cState->registerWidth = bwidth; + cState->nRegisters = 1 << bwidth; + cState->arrSize = sizeof(uint8) * cState->nRegisters + 1; + + /* + * Initialize hashes array to zero, not negative infinity, per discussion + * of the coupon collector problem in the HyperLogLog paper + */ + cState->hashesArr = palloc0(cState->arrSize); + + /* + * "alpha" is a value that for each possible number of registers (m) is + * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z + * is "the indicator function" through which we finally compute E, + * estimated cardinality). + */ + switch (cState->nRegisters) + { + case 16: + alpha = 0.673; + break; + case 32: + alpha = 0.697; + break; + case 64: + alpha = 0.709; + break; + default: + alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters); + } + + /* + * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog + * estimate E + */ + cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters; + } + + /* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ + void + addHyperLogLog(hyperLogLogState *cState, uint32 hash) + { + uint8 count; + uint32 index; + + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << cState->registerWidth, + BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + cState->hashesArr[index] = Max(count, cState->hashesArr[index]); + } + + /* + * Estimates cardinality, based on elements added so far + */ + double + estimateHyperLogLog(hyperLogLogState *cState) + { + double result; + double sum = 0.0; + int i; + + for (i = 0; i < cState->nRegisters; i++) + { + sum += 1.0 / pow(2.0, cState->hashesArr[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = cState->alphaMM / sum; + + if (result <= (5.0 / 2.0) * cState->nRegisters) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < cState->nRegisters; i++) + { + if (cState->hashesArr[i] == 0) + zero_count++; + } + + if (zero_count != 0) + result = cState->nRegisters * log((double) cState->nRegisters / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; + } + + /* + * Merges the estimate from one HyperLogLog state to another, returning the + * estimate of their union. + * + * The number of registers in each must match. + */ + void + mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState) + { + int r; + + if (cState->nRegisters != oState->nRegisters) + elog(ERROR, "number of registers mismatch: %zu != %zu", + cState->nRegisters, oState->nRegisters); + + for (r = 0; r < cState->nRegisters; ++r) + { + cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]); + } + } + + + /* + * Worker for addHyperLogLog(). 
+ * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering first 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ + static inline uint8 + rho(uint32 x, uint8 b) + { + uint8 j = 1; + + while (j <= b && !(x & 0x80000000)) + { + j++; + x <<= 1; + } + + return j; + } diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c new file mode 100644 index 135a14b..0a36c84 *** a/src/backend/utils/adt/orderedsetaggs.c --- b/src/backend/utils/adt/orderedsetaggs.c *************** ordered_set_startup(FunctionCallInfo fci *** 274,280 **** qstate->sortOperators, qstate->sortCollations, qstate->sortNullsFirsts, ! work_mem, false); else osastate->sortstate = tuplesort_begin_datum(qstate->sortColType, qstate->sortOperator, --- 274,280 ---- qstate->sortOperators, qstate->sortCollations, qstate->sortNullsFirsts, ! 
work_mem, -1, false); else osastate->sortstate = tuplesort_begin_datum(qstate->sortColType, qstate->sortOperator, diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c new file mode 100644 index 1bd8abb..96df4a9 *** a/src/backend/utils/adt/varlena.c --- b/src/backend/utils/adt/varlena.c *************** *** 17,25 **** --- 17,27 ---- #include <ctype.h> #include <limits.h> + #include "access/hash.h" #include "access/tuptoaster.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" + #include "lib/hyperloglog.h" #include "libpq/md5.h" #include "libpq/pqformat.h" #include "miscadmin.h" *************** *** 32,37 **** --- 34,42 ---- #include "utils/pg_locale.h" #include "utils/sortsupport.h" + #ifdef DEBUG_ABBREV_KEYS + #define DEBUG_elog_output DEBUG1 + #endif /* GUC variable */ int bytea_output = BYTEA_OUTPUT_HEX; *************** typedef struct *** 54,63 **** typedef struct { ! char *buf1; /* 1st string */ ! char *buf2; /* 2nd string */ int buflen1; int buflen2; #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif --- 59,70 ---- typedef struct { ! char *buf1; /* 1st string, or abbreviation original string buf */ ! 
char *buf2; /* 2nd string, or abbreviation strxfrm() buf */ int buflen1; int buflen2; + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif *************** typedef struct *** 75,83 **** --- 82,98 ---- #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n)) #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x) + /* + * Used for calculating number of sort comparisons + */ + #define LOG2(x) (log(x) / 0.693147180559945) + static void btsortsupport_worker(SortSupport ssup, Oid collid); + static int bttextcmp_abbreviated(Datum x, Datum y, SortSupport ssup); static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup); static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup); + static Datum bttext_convert(Datum original, SortSupport ssup); + static bool bttext_abort(int memtupcount, double rowhint, SortSupport ssup); static int32 text_length(Datum str); static text *text_catenate(text *t1, text *t2); static text *text_substring(Datum str, *************** btsortsupport_worker(SortSupport ssup, O *** 1725,1750 **** TextSortSupport *tss; /* - * If LC_COLLATE = C, we can make things quite a bit faster by using - * memcmp() rather than strcoll(). To minimize the per-comparison - * overhead, we make this decision just once for the whole sort. - */ - if (lc_collate_is_c(collid)) - { - ssup->comparator = bttextfastcmp_c; - return; - } - - /* * WIN32 requires complex hacks when the database encoding is UTF-8 (except * when using the "C" collation). For now, we don't optimize that case. */ #ifdef WIN32 ! if (GetDatabaseEncoding() == PG_UTF8) return; #endif /* * We may need a collation-sensitive comparison. To make things faster, * we'll figure out the collation based on the locale id and cache the * result. 
Also, since strxfrm()/strcoll() require NUL-terminated inputs, --- 1740,1811 ---- TextSortSupport *tss; /* * WIN32 requires complex hacks when the database encoding is UTF-8 (except * when using the "C" collation). For now, we don't optimize that case. */ #ifdef WIN32 ! if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid)) return; #endif /* + * On platforms where the abbreviated key for text optimization might have + * bad worst case performance it is avoided entirely. In order to ever + * apply the optimization we require: + * + * * That the platform's strxfrm() meets a certain standard for + * representing as much information as possible in leading bytes. + * + * * That there are a full 8 bytes of storage per Datum on the platform, + * since we pack bytes into that representation. Having only 4 bytes + * could make worse case performance drastically more likely. + * + * The standard applied for strxfrm() is that a significant amount of + * entropy from the original string must be concentrated at the beginning + * of returned blobs. The Mac OS X implementation is known to produce + * blobs with very little entropy; it produces printable blobs with a + * series of 24-bit weights, each one encoded into four bytes. Multiple + * "levels" of weights are separated by a weight of 0. Typically only two + * ASCII characters are represented in the first eight bytes. Therefore, + * the optimization is specially disabled. + * + * Any reasonable implementation will pack primary weights into the start + * of returned blobs. The canonical algorithm's implementation is + * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION + * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key" + * is of particular interest: + * + * http://www.unicode.org/reports/tr10/#Step_3 + * + * The collation algorithm standard goes on to state: + * + * "By default, the algorithm makes use of three fully-customizable levels. 
+ * For the Latin script, these levels correspond roughly to: + * + * alphabetic ordering + * + * diacritic ordering + * + * case ordering. + * + * A final level may be used for tie-breaking between strings not otherwise + * distinguished." + * + * It is generally expected that most non-equal keys will have their + * comparisons resolved at the primary level. If enough comparisons can be + * resolved with just 8 byte abbreviated keys, this optimization is very + * effective (although if there are many tie-breakers that largely only + * perform cheap memcmp() calls, that is also much faster than the + * unoptimized case - see bttext_abort()). + * + * There is no reason to not at least perform fmgr elision on platforms + * where abbreviation isn't expected to be profitable, though. + */ + #if defined(__darwin__) || SIZEOF_DATUM != 8 + ssup->position = sortKeyOther; + #endif + + /* * We may need a collation-sensitive comparison. To make things faster, * we'll figure out the collation based on the locale id and cache the * result. Also, since strxfrm()/strcoll() require NUL-terminated inputs, *************** btsortsupport_worker(SortSupport ssup, O *** 1777,1789 **** #endif } tss->buf1 = palloc(TEXTBUFLEN); tss->buflen1 = TEXTBUFLEN; tss->buf2 = palloc(TEXTBUFLEN); tss->buflen2 = TEXTBUFLEN; ssup->ssup_extra = tss; ! ssup->comparator = bttextfastcmp_locale; } /* --- 1838,1922 ---- #endif } + if (ssup->position != sortKeyAbbreviated) + { + /* + * If LC_COLLATE = C, we can make things quite a bit faster by using + * memcmp() rather than strcoll(). To minimize the per-comparison + * overhead, we make this decision just once for the whole sort. 
+ */ + if (lc_collate_is_c(collid)) + { + ssup->comparator = bttextfastcmp_c; + } + else + { + ssup->comparator = bttextfastcmp_locale; + + /* locale case requires temp buffers */ + tss->buf1 = palloc(TEXTBUFLEN); + tss->buflen1 = TEXTBUFLEN; + tss->buf2 = palloc(TEXTBUFLEN); + tss->buflen2 = TEXTBUFLEN; + ssup->ssup_extra = tss; + } + + return; + } + + /* + * Abbreviated case requires temp buffers for strxfrm() copying, and in the + * *_locale case possibly for eventual tie-breaker comparisons during + * sorting proper + */ tss->buf1 = palloc(TEXTBUFLEN); tss->buflen1 = TEXTBUFLEN; tss->buf2 = palloc(TEXTBUFLEN); tss->buflen2 = TEXTBUFLEN; ssup->ssup_extra = tss; ! ! initHyperLogLog(&tss->abbr_card, 10); ! initHyperLogLog(&tss->full_card, 10); ! ! ssup->comparator = bttextcmp_abbreviated; ! ssup->converter = bttext_convert; ! ssup->abort_conversion = bttext_abort; ! ! ssup->tiebreak = palloc0(sizeof(SortSupportData)); ! ssup->tiebreak->ssup_cxt = ssup->ssup_cxt; ! ssup->tiebreak->ssup_collation = ssup->ssup_collation; ! ssup->tiebreak->ssup_reverse = ssup->ssup_reverse; ! ssup->tiebreak->ssup_nulls_first = ssup->ssup_nulls_first; ! ssup->tiebreak->ssup_attno = ssup->ssup_attno; ! ! /* ! * Initialize tiebreak sortsupport state with an authoritative ! * strcoll()-based comparison func for tie-breaking. ! */ ! ssup->tiebreak->position = sortKeyTiebreak; ! btsortsupport_worker(ssup->tiebreak, collid); ! } ! ! /* ! * Abbreviated key comparison func ! */ ! static int ! bttextcmp_abbreviated(Datum x, Datum y, SortSupport ssup) ! { ! char *a = (char *) &x; ! char *b = (char *) &y; ! int result; ! ! result = memcmp(a, b, sizeof(Datum)); ! ! /* ! * When result = 0, the core system will call bttextfastcmp_c() or ! * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm() ! * blobs cannot indicate *equality* authoritatively, for the same reason ! * that there is a strcoll() strcmp() tie-breaker in varstr_cmp(). ! */ ! 
return result; } /* *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1842,1847 **** --- 1975,2008 ---- len1 = VARSIZE_ANY_EXHDR(arg1); len2 = VARSIZE_ANY_EXHDR(arg2); + if (ssup->position == sortKeyTiebreak && len1 == len2) + { + /* + * Being called as authoritative tie-breaker for an abbreviated key + * comparison. + * + * TODO: Consider using this optimization more frequently, perhaps + * even entirely opportunistically. + * + * In general there is a pretty good chance control reached here + * because the two keys are actually fully equal. Try and give an + * answer using only a cheap memcmp() comparison on the assumption that + * this will indicate equality frequently enough for it to be worth it + * on balance. This is a reasonable assumption, since sorting is + * almost certainly bottlenecked on memory latency. + * + * Original, authoritative key cardinality is weighed within + * bttext_abort(). Cases where attempts at resolving tie-breakers in + * this manner are the usual outcome, and yet those attempts usually + * fail should only arise infrequently. + */ + if (memcmp(a1p, a2p, len1) == 0) + { + result = 0; + goto done; + } + } + if (len1 >= tss->buflen1) { pfree(tss->buf1); *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1851,1857 **** if (len2 >= tss->buflen2) { pfree(tss->buf2); ! tss->buflen1 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); } --- 2012,2018 ---- if (len2 >= tss->buflen2) { pfree(tss->buf2); ! tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); } *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1875,1880 **** --- 2036,2043 ---- if (result == 0) result = strcmp(tss->buf1, tss->buf2); + done: + /* We can't afford to leak memory here. 
*/ if (PointerGetDatum(arg1) != x) pfree(arg1); *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1884,1889 **** --- 2047,2324 ---- return result; } + /* + * Conversion routine for sortsupport. Converts original text to abbreviated + * key representation. Our encoding strategy is simple -- pack the first 8 + * bytes of a strxfrm() blob into a Datum. + */ + static Datum + bttext_convert(Datum original, SortSupport ssup) + { + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + text *authoritative = DatumGetTextPP(original); + + /* working state */ + Datum res; + char *pres; + int len; + Size bsize; + uint32 lohalf, + hihalf, + hash; + + /* + * Abbreviated key representation is a pass-by-value Datum that is treated + * as a char array by the specialized comparator bttextcmp_abbreviated(). + */ + pres = (char *) &res; + /* memset() so untouched bytes are NULL */ + memset(pres, 0, sizeof(Datum)); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* By convention, we use buffer 1 to store and NULL terminate text */ + if (len >= tss->buflen1) + { + pfree(tss->buf1); + tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize)); + tss->buf1 = palloc(tss->buflen1); + } + + /* Just like strcoll(), strxfrm() expects a NULL-terminated string */ + memcpy(tss->buf1, VARDATA_ANY(authoritative), len); + tss->buf1[len] = '\0'; + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + + retry: + + /* + * There is no special handling of the C locale here. strxfrm() is used + * indifferently. + */ + #ifdef HAVE_LOCALE_T + if (tss->locale) + bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale); + else + #endif + bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2); + + if (bsize >= tss->buflen2) + { + /* + * The C standard states that the contents of the buffer is now + * unspecified. Grow buffer, and retry. 
+ */ + pfree(tss->buf2); + tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize)); + tss->buf2 = palloc(tss->buflen2); + goto retry; + } + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string transformations for no saving in + * full strcoll()-based comparisons. These statistics are used by + * bttext_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. + */ + hash = hash_any((unsigned char *) tss->buf1, Min(len, CACHE_LINE_SIZE)); + + if (len > CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&tss->full_card, hash); + + memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize)); + + /* Hash abbreviated key */ + lohalf = (uint32) res; + hihalf = (uint32) (res >> 32); + hash = hash_uint32(lohalf ^ hihalf); + + addHyperLogLog(&tss->abbr_card, hash); + + /* + * Every Datum byte is compared. This is safe because the strxfrm() blob + * is itself NULL-terminated, leaving no danger of misinterpreting any NULL + * bytes not intended to be interpreted as logically representing + * termination. + */ + return res; + } + + /* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. 
+ */ + static bool + bttext_abort(int memtupcount, double rowhint, SortSupport ssup) + { + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + double abbrev_distinct, + key_distinct, + norm_key_card; + + Assert(ssup->position == sortKeyAbbreviated); + + /* Have a little patience, even without a rowhint */ + if (memtupcount < 20) + return false; + + if (rowhint > 0) + { + double normalized_rows_to_process, + estimated_cmps; + + normalized_rows_to_process = (rowhint - memtupcount) / rowhint; + + if (normalized_rows_to_process > 0.95 && memtupcount < 200000) + { + /* + * Be patient -- don't consider aborting until we've processed an + * estimated 5% of all rows to be sorted, or 200,000 rows, + * whichever is less. + */ + #ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "conversion patiently waited after %d tuples of %f", + memtupcount, rowhint); + #endif + return false; + } + else if (normalized_rows_to_process < 0.65) + { + /* Already too invested -- don't abort a marginal case */ + return false; + } + + /* + * strxfrm() is strongly recommended for large lists of strings. This + * is because despite the memory overhead often implied by an approach + * using string transformation, the number of comparisons that a + * comparison sort algorithm requires increases at least in proportion + * to O(n log n). Linearithmic growth will result in a number of + * comparisons that is considerably higher than the number of elements. + * (top-N heapsorts never use the abbreviation optimization, and so are + * not considered here.) + * + * Unicode Technical Standard #10 states "Because binary comparison is + * much faster than string comparison, it is faster to use sort keys + * whenever there will be more than about 10 comparisons per string, if + * the system can afford the storage". That would amount to + * approximately 1,000 list elements on average. While our costs are + * clearly different in several ways, this calculus cannot be ignored + * entirely. 
Past a certain point, we are probabilistically better off + * holding out for some improvement even if there is an abbreviated key + * cardinality of 1 thus far. That point is somewhat arbitrarily + * assumed to be 20 comparisons per string (approximately 1 million + * estimated rows). We may still lose, but not by terribly much, and + * only in cases close to the most pessimal worst case. Even in that + * very worst case, as this tuple count threshold is crossed the + * regression for internal sorts is at or under 5%. This is helped by + * fmgr elision, and not hurt much by useless abbreviated comparisons. + * (a strxfrm() is expensive, a memcmp() of a Datum already in L1 cache + * much less so.) + */ + estimated_cmps = rowhint * LOG2(rowhint); + + if (estimated_cmps > rowhint * 20) + { + #ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "row estimate too high (%f, estimated cmps: %f) to ever abort", + rowhint, estimated_cmps); + #endif + return false; + } + } + + abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); + key_distinct = estimateHyperLogLog(&tss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While NULLs + * are generally disregarded, if only NULL values were seen so far, that + * might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct <= 1.0) + abbrev_distinct = 1.0; + + if (key_distinct <= 1.0) + key_distinct = 1.0; + + /* + * In the worst case all abbreviated keys are identical, while at the same + * time there are differences within full key strings not captured in + * abbreviations. 
+ */ + #ifdef DEBUG_ABBREV_KEYS + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card); + } + #endif + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct authoritative original keys, that's reason enough to + * proceed. We can win even with a very low cardinality set if most + * tie-breakers only memcmp(). This is by far the most important + * consideration. + * + * While comparisons that are resolved at the abbreviated key level are + * considerably cheaper than tie-breakers resolved with memcmp(), both of + * those two outcomes are so much cheaper than a full strcoll() once + * sorting is underway that it doesn't seem worth it to weigh abbreviated + * cardinality against the overall size of the set in order to more + * accurately model costs. + */ + if (abbrev_distinct > key_distinct * 0.05) + return false; + + /* + * Normalized cardinality is proportion of distinct original, authoritative + * keys + */ + norm_key_card = key_distinct / (double) memtupcount; + + /* + * If this is a very low cardinality set generally, that is reason enough + * to favor our strategy, since many comparisons can be resolved with + * inexpensive memcmp() tie-breakers, even though abbreviated keys are of + * marginal utility. + */ + if (norm_key_card < 0.05) + return false; + + /* + * Abort abbreviation strategy. + * + * The worst case, where all abbreviated keys are identical while all + * original strings differ will typically only see a regression of about + * 10% in execution time for small to medium sized lists of strings. + * Whereas on modern CPUs where cache stalls are the dominant cost, we can + * often expect very large improvements, particularly with sets of strings + * of moderately high to high abbreviated cardinality. 
There is little to + * lose but much to gain, which our strategy reflects. + */ + #ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f", + memtupcount, abbrev_distinct, key_distinct); + /* Actually abort only when debugging is disabled */ + return false; + #endif + + return true; + } + Datum text_larger(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c new file mode 100644 index 8e57505..b0041b9 *** a/src/backend/utils/sort/tuplesort.c --- b/src/backend/utils/sort/tuplesort.c *************** bool optimize_bounded_sort = true; *** 150,156 **** * When sorting single Datums, the data value is represented directly by * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, * then datum1 points to a separately palloc'd data value that is also pointed ! * to by the "tuple" pointer; otherwise "tuple" is NULL. * * While building initial runs, tupindex holds the tuple's run number. During * merge passes, we re-use it to hold the input tape number that each tuple in --- 150,159 ---- * When sorting single Datums, the data value is represented directly by * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, * then datum1 points to a separately palloc'd data value that is also pointed ! * to by the "tuple" pointer; otherwise "tuple" is NULL. There is one special ! * case: when the sort support infrastructure provides an "abbreviated key" ! * representation, where the key is (typically) a pass by value proxy for a ! * pass by reference type. * * While building initial runs, tupindex holds the tuple's run number. 
During * merge passes, we re-use it to hold the input tape number that each tuple in *************** struct Tuplesortstate *** 353,358 **** --- 356,370 ---- SortSupport onlyKey; /* + * Additional state for managing "abbreviated key" sortsupport routines + * (currently only used in the MinimalTuple case). Tracks the intervals + * at which the optimization's effectiveness is tested. + */ + int nextabbrev; /* Tuple # at which to next check applicability */ + bool aborted; /* Abbreviation process aborted? */ + double rowsHint; /* Hint # rows to be sorted, -1 if unknown */ + + /* * These variables are specific to the CLUSTER case; they are set by * tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and * indexScanKey. *************** tuplesort_begin_heap(TupleDesc tupDesc, *** 600,606 **** int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, bool randomAccess) { Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); MemoryContext oldcontext; --- 612,619 ---- int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, double planRows, ! bool randomAccess) { Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); MemoryContext oldcontext; *************** tuplesort_begin_heap(TupleDesc tupDesc, *** 632,637 **** --- 645,652 ---- state->reversedirection = reversedirection_heap; state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->nextabbrev = 10; + state->rowsHint = planRows; /* Prepare SortSupport data for each column */ state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); *************** tuplesort_begin_heap(TupleDesc tupDesc, *** 648,657 **** sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } ! 
if (nkeys == 1) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); --- 663,687 ---- sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; + /* + * Convey to sortsupport routine if abbreviation optimization is + * applicable in principle + */ + if (i == 0) + sortKey->position = sortKeyAbbreviated; + else + sortKey->position = sortKeyOther; + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } ! /* ! * The "onlyKey" optimization cannot be used with abbreviated keys, since ! * tie-breaker comparisons may be required. Typically, the optimization is ! * only of value to pass-by-value types anyway, whereas abbreviated keys ! * are typically only of value to pass-by-reference types. ! */ ! if (nkeys == 1 && !state->sortKeys->converter) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); *************** tuplesort_begin_datum(Oid datumType, Oid *** 838,843 **** --- 868,880 ---- /* Prepare SortSupport data */ state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData)); + /* + * Conversion to abbreviated representation infeasible in the Datum case. + * It must be possible to subsequently fetch original datum values within + * tuplesort_getdatum(), which would require special-case preservation of + * original values that we prefer to avoid. + */ + state->onlyKey->position = sortKeyOther; state->onlyKey->ssup_cxt = CurrentMemoryContext; state->onlyKey->ssup_collation = sortCollation; state->onlyKey->ssup_nulls_first = nullsFirstFlag; *************** tuplesort_set_bound(Tuplesortstate *stat *** 886,891 **** --- 923,938 ---- state->bounded = true; state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable. 
+ */ + if (state->sortKeys && state->sortKeys->converter) + { + state->sortKeys = state->sortKeys->tiebreak; + state->sortKeys->position = sortKeyOther; + } } /* *************** comparetup_heap(const SortTuple *a, cons *** 2861,2872 **** int nkey; int32 compare; ! /* Compare the leading sort key */ ! compare = ApplySortComparator(a->datum1, a->isnull1, ! b->datum1, b->isnull1, ! sortKey); ! if (compare != 0) ! return compare; /* Compare additional sort keys */ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; --- 2908,2922 ---- int nkey; int32 compare; ! if (!state->aborted) ! { ! /* Compare the leading sort key */ ! compare = ApplySortComparator(a->datum1, a->isnull1, ! b->datum1, b->isnull1, ! sortKey); ! if (compare != 0) ! return compare; ! } /* Compare additional sort keys */ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; *************** comparetup_heap(const SortTuple *a, cons *** 2874,2879 **** --- 2924,2958 ---- rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); tupDesc = state->tupDesc; + + /* + * If a leading abbreviated comparison returned 0 or abbreviation strategy + * was abandoned, call tie-breaker comparator using original, authoritative + * representation, which may break tie when differences were not captured + * within abbreviated representation + */ + if (state->sortKeys->converter) + { + AttrNumber attno = sortKey->ssup_attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + Assert(attno == sortKey->tiebreak->ssup_attno); + Assert(sortKey->position == sortKeyAbbreviated); + Assert(sortKey->tiebreak->position == sortKeyTiebreak); + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey->tiebreak); + if (compare != 0) + return compare; + } + sortKey++; for (nkey = 1; 
nkey < state->nKeys; nkey++, sortKey++) { *************** copytup_heap(Tuplesortstate *state, Sort *** 2914,2923 **** /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); ! stup->datum1 = heap_getattr(&htup, ! state->sortKeys[0].ssup_attno, ! state->tupDesc, ! &stup->isnull1); } static void --- 2993,3035 ---- /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); ! ! /* Once aborted, we give up on storing anything in datum1 entirely */ ! if (state->aborted) ! return; ! ! if (!state->sortKeys->converter) ! { ! /* Store ordinary Datum representation */ ! stup->datum1 = heap_getattr(&htup, ! state->sortKeys[0].ssup_attno, ! state->tupDesc, ! &stup->isnull1); ! } ! else ! { ! Datum original; ! ! /* Store abbreviated key representation */ ! original = heap_getattr(&htup, state->sortKeys[0].ssup_attno, ! state->tupDesc, &stup->isnull1); ! ! if (stup->isnull1) ! stup->datum1 = original; ! else ! stup->datum1 = state->sortKeys->converter(original, ! state->sortKeys); ! ! if (state->memtupcount >= state->nextabbrev) ! { ! /* Check effectiveness of optimization. Consider aborting. */ ! state->nextabbrev *= 2; ! if (state->sortKeys->abort_conversion(state->memtupcount, ! state->rowsHint, ! state->sortKeys)) ! state->aborted = true; ! } ! 
} } static void *************** reversedirection_heap(Tuplesortstate *st *** 2983,2988 **** --- 3095,3109 ---- sortKey->ssup_reverse = !sortKey->ssup_reverse; sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; } + + if (state->sortKeys->tiebreak) + { + sortKey = state->sortKeys->tiebreak; + + Assert(sortKey->position == sortKeyTiebreak); + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } } diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h new file mode 100644 index ...8a1e15f *** a/src/include/lib/hyperloglog.h --- b/src/include/lib/hyperloglog.h *************** *** 0 **** --- 1,67 ---- + /* + * hyperloglog.h + * + * A simple HyperLogLog cardinality estimator implementation + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. The copyright term's of Ohno's + * original version (the MIT license) follow. + * + * src/include/lib/hyperloglog.h + */ + + /* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + + #ifndef HYPERLOGLOG_H + #define HYPERLOGLOG_H + + /* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * hyperLogLogState + * + * registerWidth register width, in bits ("k") + * nRegisters number of registers + * alphaMM alpha * m ^ 2 (see initHyperLogLog()) + * hashesArr array of hashes + * arrSize size of hashesArr + */ + typedef struct hyperLogLogState + { + uint8 registerWidth; + Size nRegisters; + double alphaMM; + uint8 *hashesArr; + Size arrSize; + } hyperLogLogState; + + extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth); + extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash); + extern double estimateHyperLogLog(hyperLogLogState *cState); + extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState); + + #endif /* HYPERLOGLOG_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h new file mode 100644 index d78f38e..a27f5e3 *** a/src/include/pg_config_manual.h --- b/src/include/pg_config_manual.h *************** *** 222,234 **** #endif /* ! * Assumed cache line size. This doesn't affect correctness, but can be ! * used for low-level optimizations. Currently, this is only used to pad ! * some data structures in xlog.c, to ensure that highly-contended fields ! * are on different cache lines. Too small a value can hurt performance due ! * to false sharing, while the only downside of too large a value is a few ! * bytes of wasted memory. 
The default is 128, which should be large enough ! * for all supported platforms. */ #define CACHE_LINE_SIZE 128 --- 222,234 ---- #endif /* ! * Assumed cache line size. This doesn't affect correctness, but can be used ! * for low-level optimizations. Currently, this is used to pad some data ! * structures in xlog.c, to ensure that highly-contended fields are on ! * different cache lines. Too small a value can hurt performance due to false ! * sharing, while the only downside of too large a value is a few bytes of ! * wasted memory. The default is 128, which should be large enough for all ! * supported platforms. */ #define CACHE_LINE_SIZE 128 diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h new file mode 100644 index 8b6b0de..e7cc0b8 *** a/src/include/utils/sortsupport.h --- b/src/include/utils/sortsupport.h *************** *** 21,27 **** * required to provide all of them. The BTSORTSUPPORT function should * simply not set any function pointers for mechanisms it doesn't support. * (However, all opclasses that provide BTSORTSUPPORT are required to provide ! * the comparator function.) * * All sort support functions will be passed the address of the * SortSupportData struct when called, so they can use it to store --- 21,29 ---- * required to provide all of them. The BTSORTSUPPORT function should * simply not set any function pointers for mechanisms it doesn't support. * (However, all opclasses that provide BTSORTSUPPORT are required to provide ! * the comparator function, and opclasses that support the optional additional ! * abbreviated key capability must provide a tiebreaker state, converter and a ! * comparator that works with the abbreviated representation.) 
* * All sort support functions will be passed the address of the * SortSupportData struct when called, so they can use it to store *************** *** 49,54 **** --- 51,63 ---- #include "access/attnum.h" + typedef enum + { + sortKeyOther, /* Second/subsequent key, or no abbreviation capability */ + sortKeyAbbreviated, /* Leading (abbreviated applicable) key (when available) */ + sortKeyTiebreak, /* "True" leading key, abbreviation tie-breaker */ + } SortKeyPosition; + typedef struct SortSupportData *SortSupport; typedef struct SortSupportData *************** typedef struct SortSupportData *** 92,103 **** * than, equal to, or greater than y. Note that x and y are guaranteed * not null, and there is no way to return null either. Do not return * INT_MIN, as callers are allowed to negate the result before using it. */ int (*comparator) (Datum x, Datum y, SortSupport ssup); /* ! * Additional sort-acceleration functions might be added here later. */ } SortSupportData; --- 101,203 ---- * than, equal to, or greater than y. Note that x and y are guaranteed * not null, and there is no way to return null either. Do not return * INT_MIN, as callers are allowed to negate the result before using it. + * + * This comparator may require a tie-breaker for opclasses with additional + * special support for dealing with an abbreviated key representation. */ int (*comparator) (Datum x, Datum y, SortSupport ssup); /* ! * "Abbreviated key" infrastructure follows. All callbacks must be set by ! * sortsupport opclasses that make use of this optional additional ! * infrastructure. ! * ! * This allows opclass authors to supply a conversion routine, used to ! * create an alternative representation of the underlying type (an ! * "abbreviated key"). Typically, this representation is an ad-hoc, ! * pass-by-value Datum format that only the opclass has knowledge of. An ! * alternative comparator, used only with this alternative representation ! * must also be provided. 
This representation is a simple approximation of ! * the original Datum. It must be possible to compare datums of this ! * representation with each other using the supplied alternative ! * comparator, and have any non-zero return value be a reliable proxy for ! * what a proper comparison would indicate. Returning zero from the ! * alternative comparator does not indicate equality, as with a ! * conventional support routine 1, though -- it indicates that it wasn't ! * possible to determine how the two abbreviated values compared. A proper ! * comparison is therefore required. In many cases this results in most or ! * all comparisons only using the cheap alternative comparison func, which ! * is typically implemented as code that compiles to just a few CPU ! * instructions. The technique is particularly useful for internal sorts ! * (i.e. quicksort). CPU cache miss penalties have grown to the point ! * where good overall performance must heavily weigh cache performance. ! * ! * Opclass authors must consider the final cardinality of abbreviated keys ! * when devising an encoding scheme. It's possible for a strategy to work ! * better than an alternative strategy with one usage pattern, while the ! * reverse might be true for another usage pattern. All of these factors ! * should be weighed. ! */ ! ! /* ! * Sort key "position" mostly just relates to whether or not the ! * abbreviated key optimization is applicable in principle (that is, the ! * sortsupport routine needs to know if its dealing with a leading key). ! * Even with a leading key, internal sortsupport clients like tuplesort may ! * represent it as sortKeyOther because it isn't feasible to inject the ! * conversion routine. However, sortKeyTiebreak means that it's a "proper" ! * sortsupport state, originally generated by the sortsupport routine ! * itself - the core system will never directly create a sort support state ! * with tie-break position. There is very little distinction between ! 
* sortKeyTiebreak and sortKeyOther positions, though. The distinction ! * only exists to allow sortsupport routines to squeeze a bit more ! * performance from the knowledge that an authoritative tie-breaker ! * comparison is required because a prior alternative comparison didn't ! * work out (as opposed to being called without there ever being an ! * abbreviated comparison of the tuple attribute). ! * ! * A sortsupport routine may initially decide against applying the ! * abbreviation optimization by setting a passed sortKeyAbbreviated sort ! * support state's position to sortKeyOther. This is typically used for ! * platform-specific special cases where the optimization is thought to be ! * ineffective. ! */ ! SortKeyPosition position; ! ! /* ! * Converter to abbreviated format, from original representation. Core ! * code uses this callback to convert from a pass-by-reference "original" ! * Datum to a pass-by-value abbreviated key Datum. Note that original is ! * guaranteed not null because it doesn't make sense to factor NULL values ! * into ad-hoc cost model. ! */ ! Datum (*converter) (Datum original, SortSupport ssup); ! ! /* ! * This callback allows clients to verify that the current strategy is ! * working out, using a sortsupport routine defined ad-hoc cost model. If ! * there is a lot of duplicate abbreviated keys in practice, it's useful to ! * be able to abandon the strategy before paying too high a cost in ! * conversion. rowhint is typically an estimate of total rows that will be ! * sorted originating from the planner, but clients are at liberty to ! * provide a new hint with each call, should better information become ! * available during the course of conversion. (By convention, negative ! * rowhint values are interpreted as "no hint available".) ! */ ! bool (*abort_conversion) (int memtupcount, double rowhint, ! SortSupport ssup); ! ! /* ! * Alternative tiebreak SortSupport state for leading (abbreviated) key, ! 
* used only when alternative comparator returned 0, and the core system ! * must use this separate state to perform an authoritative comparison. ! * This relates to the same attribute as our ssup_attno, but code like ! * tuplesort is required to call it directly as a tie-breaker. ! * ! * It is only ever initialized by a SortSupport routine with abbreviation ! * support, and not any internal code. */ + SortSupport tiebreak; } SortSupportData; diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h new file mode 100644 index 2537883..e5de4f4 *** a/src/include/utils/tuplesort.h --- b/src/include/utils/tuplesort.h *************** extern Tuplesortstate *tuplesort_begin_h *** 62,68 **** int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, bool randomAccess); --- 62,68 ---- int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, double planRows, bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, bool randomAccess);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers