On Fri, Aug 22, 2014 at 7:19 AM, Robert Haas <[email protected]> wrote:
> Patch 0002 no longer applies; please rebase.
I attach a rebased patch.
Note that there is currently a bug in the master branch here (buflen1 is assigned where buflen2 is intended):
+ if (len2 >= tss->buflen2)
+ {
+ pfree(tss->buf2);
+ tss->buflen1 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize));
+ tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
+ }
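The fix, which the attached patch also carries within its bttextfastcmp_locale()
changes, is simply to grow buf2's length field rather than buf1's:

    if (len2 >= tss->buflen2)
    {
        pfree(tss->buf2);
        /* grow buf2's length, not buf1's */
        tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize));
        tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
    }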
Thanks
--
Peter Geoghegan
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
new file mode 100644
index c09ca7e..adf4b8c
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2292,2297 ****
--- 2292,2303 ----
/* We always use the default collation for statistics */
ssup.ssup_collation = DEFAULT_COLLATION_OID;
ssup.ssup_nulls_first = false;
+ /*
+ * For now, don't perform abbreviated key conversion, because full values
+ * are required for MCV slot generation. Supporting that optimization
+ * would necessitate teaching compare_scalars() to call a tie-breaker.
+ */
+ ssup.position = sortKeyOther;
PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
new file mode 100644
index 510d1c5..3b335e7
*** a/src/backend/executor/nodeAgg.c
--- b/src/backend/executor/nodeAgg.c
*************** initialize_aggregates(AggState *aggstate
*** 346,351 ****
--- 346,352 ----
{
AggStatePerAgg peraggstate = &peragg[aggno];
AggStatePerGroup pergroupstate = &pergroup[aggno];
+ Agg *node = (Agg *) aggstate->ss.ps.plan;
/*
* Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
*************** initialize_aggregates(AggState *aggstate
*** 363,368 ****
--- 364,373 ----
* We use a plain Datum sorter when there's a single input column;
* otherwise sort the full tuple. (See comments for
* process_ordered_aggregate_single.)
+ *
+ * FIXME: It ought to be useful to force the tuplesort_begin_heap()
+ * case here, so that the abbreviated key optimization can be used
+ * even when numInputs == 1.
*/
peraggstate->sortstate =
(peraggstate->numInputs == 1) ?
*************** initialize_aggregates(AggState *aggstate
*** 377,383 ****
peraggstate->sortOperators,
peraggstate->sortCollations,
peraggstate->sortNullsFirst,
! work_mem, false);
}
/*
--- 382,390 ----
peraggstate->sortOperators,
peraggstate->sortCollations,
peraggstate->sortNullsFirst,
! work_mem,
! node->plan.plan_rows,
! false);
}
/*
diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c
new file mode 100644
index 47ed068..677a953
*** a/src/backend/executor/nodeMergeAppend.c
--- b/src/backend/executor/nodeMergeAppend.c
*************** ExecInitMergeAppend(MergeAppend *node, E
*** 137,142 ****
--- 137,151 ----
sortKey->ssup_nulls_first = node->nullsFirst[i];
sortKey->ssup_attno = node->sortColIdx[i];
+ /*
+ * It isn't feasible to perform abbreviated key conversion, since
+ * tuples are pulled into mergestate's binary heap as needed. It would
+ * likely be counter-productive to convert tuples into an abbreviated
+ * representation as they're pulled up, so opt out of that additional
+ * optimization entirely.
+ */
+ sortKey->position = sortKeyOther;
+
PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
}
diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c
new file mode 100644
index fdf2f4c..39f0ff1
*** a/src/backend/executor/nodeMergejoin.c
--- b/src/backend/executor/nodeMergejoin.c
*************** MJExamineQuals(List *mergeclauses,
*** 229,234 ****
--- 229,242 ----
elog(ERROR, "cannot merge using non-equality operator %u",
qual->opno);
+ /*
+ * The sortsupport routine must know whether the abbreviation
+ * optimization is applicable in principle. It is never applicable for
+ * merge joins, because there is no convenient opportunity to convert to
+ * the alternative representation. Only elide the fmgr trampoline here.
+ */
+ clause->ssup.position = sortKeyOther;
+
/* And get the matching support or comparison function */
Assert(clause->ssup.comparator == NULL);
sortfunc = get_opfamily_proc(opfamily,
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
new file mode 100644
index b88571b..31d3ead
*** a/src/backend/executor/nodeSort.c
--- b/src/backend/executor/nodeSort.c
*************** ExecSort(SortState *node)
*** 89,94 ****
--- 89,95 ----
plannode->collations,
plannode->nullsFirst,
work_mem,
+ plannode->plan.plan_rows,
node->randomAccess);
if (node->bounded)
tuplesort_set_bound(tuplesortstate, node->bound);
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
new file mode 100644
index 327a1bc..14f9a46
*** a/src/backend/lib/Makefile
--- b/src/backend/lib/Makefile
*************** subdir = src/backend/lib
*** 12,17 ****
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
! OBJS = ilist.o binaryheap.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk
--- 12,17 ----
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
! OBJS = ilist.o binaryheap.o hyperloglog.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c
new file mode 100644
index ...4db42bf
*** a/src/backend/lib/hyperloglog.c
--- b/src/backend/lib/hyperloglog.c
***************
*** 0 ****
--- 1,228 ----
+ /*-------------------------------------------------------------------------
+ *
+ * hyperloglog.c
+ * HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
+ * suited to estimating the cardinality of very large sets; in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ * src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+ /*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+ #include "postgres.h"
+
+ #include <math.h>
+
+ #include "lib/hyperloglog.h"
+
+ #define POW_2_32 (4294967296.0)
+ #define NEG_POW_2_32 (-4294967296.0)
+
+ static inline uint8 rho(uint32 x, uint8 b);
+
+ /*
+ * Initialize HyperLogLog track state
+ *
+ * bwidth is bit width (so register size will be 2 to the power of bwidth).
+ * Must be between 4 and 16 inclusive.
+ */
+ void
+ initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+ {
+ double alpha;
+
+ if (bwidth < 4 || bwidth > 16)
+ elog(ERROR, "bit width must be between 4 and 16 inclusive");
+
+ cState->registerWidth = bwidth;
+ cState->nRegisters = 1 << bwidth;
+ cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+
+ /*
+ * Initialize hashes array to zero, not negative infinity, per discussion
+ * of the coupon collector problem in the HyperLogLog paper
+ */
+ cState->hashesArr = palloc0(cState->arrSize);
+
+ /*
+ * "alpha" is a value that for each possible number of registers (m) is
+ * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
+ * is "the indicator function" through which we finally compute E,
+ * estimated cardinality).
+ */
+ switch (cState->nRegisters)
+ {
+ case 16:
+ alpha = 0.673;
+ break;
+ case 32:
+ alpha = 0.697;
+ break;
+ case 64:
+ alpha = 0.709;
+ break;
+ default:
+ alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+ }
+
+ /*
+ * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
+ * estimate E
+ */
+ cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
+ }
+
+ /*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any(). The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging. There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
+ */
+ void
+ addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+ {
+ uint8 count;
+ uint32 index;
+
+ /* Use the first "k" (registerWidth) bits as a zero based index */
+ index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+ /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+ count = rho(hash << cState->registerWidth,
+ BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+ cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
+ }
+
+ /*
+ * Estimates cardinality, based on elements added so far
+ */
+ double
+ estimateHyperLogLog(hyperLogLogState *cState)
+ {
+ double result;
+ double sum = 0.0;
+ int i;
+
+ for (i = 0; i < cState->nRegisters; i++)
+ {
+ sum += 1.0 / pow(2.0, cState->hashesArr[i]);
+ }
+
+ /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+ result = cState->alphaMM / sum;
+
+ if (result <= (5.0 / 2.0) * cState->nRegisters)
+ {
+ /* Small range correction */
+ int zero_count = 0;
+
+ for (i = 0; i < cState->nRegisters; i++)
+ {
+ if (cState->hashesArr[i] == 0)
+ zero_count++;
+ }
+
+ if (zero_count != 0)
+ result = cState->nRegisters * log((double) cState->nRegisters /
+ zero_count);
+ }
+ else if (result > (1.0 / 30.0) * POW_2_32)
+ {
+ /* Large range correction */
+ result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+ }
+
+ return result;
+ }
+
+ /*
+ * Merges the estimate from one HyperLogLog state to another, returning the
+ * estimate of their union.
+ *
+ * The number of registers in each must match.
+ */
+ void
+ mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState)
+ {
+ int r;
+
+ if (cState->nRegisters != oState->nRegisters)
+ elog(ERROR, "number of registers mismatch: %zu != %zu",
+ cState->nRegisters, oState->nRegisters);
+
+ for (r = 0; r < cState->nRegisters; ++r)
+ {
+ cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]);
+ }
+ }
+
+
+ /*
+ * Worker for addHyperLogLog().
+ *
+ * Calculates the position of the first set bit in first b bits of x argument
+ * starting from the first, reading from most significant to least significant
+ * bits.
+ *
+ * Example (when considering first 10 bits of x):
+ *
+ * rho(x = 0b1000000000) returns 1
+ * rho(x = 0b0010000000) returns 3
+ * rho(x = 0b0000000000) returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+ static inline uint8
+ rho(uint32 x, uint8 b)
+ {
+ uint8 j = 1;
+
+ while (j <= b && !(x & 0x80000000))
+ {
+ j++;
+ x <<= 1;
+ }
+
+ return j;
+ }
diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c
new file mode 100644
index 135a14b..0a36c84
*** a/src/backend/utils/adt/orderedsetaggs.c
--- b/src/backend/utils/adt/orderedsetaggs.c
*************** ordered_set_startup(FunctionCallInfo fci
*** 274,280 ****
qstate->sortOperators,
qstate->sortCollations,
qstate->sortNullsFirsts,
! work_mem, false);
else
osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
qstate->sortOperator,
--- 274,280 ----
qstate->sortOperators,
qstate->sortCollations,
qstate->sortNullsFirsts,
! work_mem, -1, false);
else
osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
qstate->sortOperator,
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
new file mode 100644
index 1bd8abb..96df4a9
*** a/src/backend/utils/adt/varlena.c
--- b/src/backend/utils/adt/varlena.c
***************
*** 17,25 ****
--- 17,27 ----
#include <ctype.h>
#include <limits.h>
+ #include "access/hash.h"
#include "access/tuptoaster.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
+ #include "lib/hyperloglog.h"
#include "libpq/md5.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
***************
*** 32,37 ****
--- 34,42 ----
#include "utils/pg_locale.h"
#include "utils/sortsupport.h"
+ #ifdef DEBUG_ABBREV_KEYS
+ #define DEBUG_elog_output DEBUG1
+ #endif
/* GUC variable */
int bytea_output = BYTEA_OUTPUT_HEX;
*************** typedef struct
*** 54,63 ****
typedef struct
{
! char *buf1; /* 1st string */
! char *buf2; /* 2nd string */
int buflen1;
int buflen2;
#ifdef HAVE_LOCALE_T
pg_locale_t locale;
#endif
--- 59,70 ----
typedef struct
{
! char *buf1; /* 1st string, or abbreviation original string buf */
! char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
int buflen1;
int buflen2;
+ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
+ hyperLogLogState full_card; /* Full key cardinality state */
#ifdef HAVE_LOCALE_T
pg_locale_t locale;
#endif
*************** typedef struct
*** 75,83 ****
--- 82,98 ----
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
+ /*
+ * Used for calculating number of sort comparisons
+ */
+ #define LOG2(x) (log(x) / 0.693147180559945)
+
static void btsortsupport_worker(SortSupport ssup, Oid collid);
+ static int bttextcmp_abbreviated(Datum x, Datum y, SortSupport ssup);
static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
+ static Datum bttext_convert(Datum original, SortSupport ssup);
+ static bool bttext_abort(int memtupcount, double rowhint, SortSupport ssup);
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
*************** btsortsupport_worker(SortSupport ssup, O
*** 1725,1750 ****
TextSortSupport *tss;
/*
- * If LC_COLLATE = C, we can make things quite a bit faster by using
- * memcmp() rather than strcoll(). To minimize the per-comparison
- * overhead, we make this decision just once for the whole sort.
- */
- if (lc_collate_is_c(collid))
- {
- ssup->comparator = bttextfastcmp_c;
- return;
- }
-
- /*
* WIN32 requires complex hacks when the database encoding is UTF-8 (except
* when using the "C" collation). For now, we don't optimize that case.
*/
#ifdef WIN32
! if (GetDatabaseEncoding() == PG_UTF8)
return;
#endif
/*
* We may need a collation-sensitive comparison. To make things faster,
* we'll figure out the collation based on the locale id and cache the
* result. Also, since strxfrm()/strcoll() require NUL-terminated inputs,
--- 1740,1811 ----
TextSortSupport *tss;
/*
* WIN32 requires complex hacks when the database encoding is UTF-8 (except
* when using the "C" collation). For now, we don't optimize that case.
*/
#ifdef WIN32
! if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid))
return;
#endif
/*
+ * On platforms where the abbreviated key optimization for text might have
+ * bad worst-case performance, it is avoided entirely. In order to ever
+ * apply the optimization we require:
+ *
+ * * That the platform's strxfrm() meets a certain standard for
+ * representing as much information as possible in leading bytes.
+ *
+ * * That there are a full 8 bytes of storage per Datum on the platform,
+ * since we pack bytes into that representation. Having only 4 bytes
+ * could make worst-case performance drastically more likely.
+ *
+ * The standard applied for strxfrm() is that a significant amount of
+ * entropy from the original string must be concentrated at the beginning
+ * of returned blobs. The Mac OS X implementation is known to produce
+ * blobs with very little entropy; it produces printable blobs with a
+ * series of 24-bit weights, each one encoded into four bytes. Multiple
+ * "levels" of weights are separated by a weight of 0. Typically only two
+ * ASCII characters are represented in the first eight bytes. Therefore,
+ * the optimization is specifically disabled on that platform.
+ *
+ * Any reasonable implementation will pack primary weights into the start
+ * of returned blobs. The canonical algorithm's implementation is
+ * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION
+ * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key"
+ * is of particular interest:
+ *
+ * http://www.unicode.org/reports/tr10/#Step_3
+ *
+ * The collation algorithm standard goes on to state:
+ *
+ * "By default, the algorithm makes use of three fully-customizable levels.
+ * For the Latin script, these levels correspond roughly to:
+ *
+ * alphabetic ordering
+ *
+ * diacritic ordering
+ *
+ * case ordering.
+ *
+ * A final level may be used for tie-breaking between strings not otherwise
+ * distinguished."
+ *
+ * It is generally expected that most non-equal keys will have their
+ * comparisons resolved at the primary level. If enough comparisons can be
+ * resolved with just 8 byte abbreviated keys, this optimization is very
+ * effective (although if there are many tie-breakers that largely only
+ * perform cheap memcmp() calls, that is also much faster than the
+ * unoptimized case - see bttext_abort()).
+ *
+ * There is no reason not to at least perform fmgr elision on platforms
+ * where abbreviation isn't expected to be profitable, though.
+ */
+ #if defined(__darwin__) || SIZEOF_DATUM != 8
+ ssup->position = sortKeyOther;
+ #endif
+
+ /*
* We may need a collation-sensitive comparison. To make things faster,
* we'll figure out the collation based on the locale id and cache the
* result. Also, since strxfrm()/strcoll() require NUL-terminated inputs,
*************** btsortsupport_worker(SortSupport ssup, O
*** 1777,1789 ****
#endif
}
tss->buf1 = palloc(TEXTBUFLEN);
tss->buflen1 = TEXTBUFLEN;
tss->buf2 = palloc(TEXTBUFLEN);
tss->buflen2 = TEXTBUFLEN;
ssup->ssup_extra = tss;
! ssup->comparator = bttextfastcmp_locale;
}
/*
--- 1838,1922 ----
#endif
}
+ if (ssup->position != sortKeyAbbreviated)
+ {
+ /*
+ * If LC_COLLATE = C, we can make things quite a bit faster by using
+ * memcmp() rather than strcoll(). To minimize the per-comparison
+ * overhead, we make this decision just once for the whole sort.
+ */
+ if (lc_collate_is_c(collid))
+ {
+ ssup->comparator = bttextfastcmp_c;
+ }
+ else
+ {
+ ssup->comparator = bttextfastcmp_locale;
+
+ /* locale case requires temp buffers */
+ tss->buf1 = palloc(TEXTBUFLEN);
+ tss->buflen1 = TEXTBUFLEN;
+ tss->buf2 = palloc(TEXTBUFLEN);
+ tss->buflen2 = TEXTBUFLEN;
+ ssup->ssup_extra = tss;
+ }
+
+ return;
+ }
+
+ /*
+ * Abbreviated case requires temp buffers for strxfrm() copying, and in the
+ * *_locale case possibly for eventual tie-breaker comparisons during
+ * sorting proper
+ */
tss->buf1 = palloc(TEXTBUFLEN);
tss->buflen1 = TEXTBUFLEN;
tss->buf2 = palloc(TEXTBUFLEN);
tss->buflen2 = TEXTBUFLEN;
ssup->ssup_extra = tss;
!
! initHyperLogLog(&tss->abbr_card, 10);
! initHyperLogLog(&tss->full_card, 10);
!
! ssup->comparator = bttextcmp_abbreviated;
! ssup->converter = bttext_convert;
! ssup->abort_conversion = bttext_abort;
!
! ssup->tiebreak = palloc0(sizeof(SortSupportData));
! ssup->tiebreak->ssup_cxt = ssup->ssup_cxt;
! ssup->tiebreak->ssup_collation = ssup->ssup_collation;
! ssup->tiebreak->ssup_reverse = ssup->ssup_reverse;
! ssup->tiebreak->ssup_nulls_first = ssup->ssup_nulls_first;
! ssup->tiebreak->ssup_attno = ssup->ssup_attno;
!
! /*
! * Initialize tiebreak sortsupport state with an authoritative
! * strcoll()-based comparison func for tie-breaking.
! */
! ssup->tiebreak->position = sortKeyTiebreak;
! btsortsupport_worker(ssup->tiebreak, collid);
! }
!
! /*
! * Abbreviated key comparison func
! */
! static int
! bttextcmp_abbreviated(Datum x, Datum y, SortSupport ssup)
! {
! char *a = (char *) &x;
! char *b = (char *) &y;
! int result;
!
! result = memcmp(a, b, sizeof(Datum));
!
! /*
! * When result = 0, the core system will call bttextfastcmp_c() or
! * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm()
! * blobs cannot indicate *equality* authoritatively, for the same reason
! * that varstr_cmp() falls back on a strcmp() tie-breaker after strcoll().
! */
! return result;
}
/*
*************** bttextfastcmp_locale(Datum x, Datum y, S
*** 1842,1847 ****
--- 1975,2008 ----
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
+ if (ssup->position == sortKeyTiebreak && len1 == len2)
+ {
+ /*
+ * Being called as authoritative tie-breaker for an abbreviated key
+ * comparison.
+ *
+ * TODO: Consider using this optimization more frequently, perhaps
+ * even entirely opportunistically.
+ *
+ * In general there is a pretty good chance control reached here
+ * because the two keys are actually fully equal. Try to give an
+ * answer using only a cheap memcmp() comparison on the assumption that
+ * this will indicate equality frequently enough for it to be worth it
+ * on balance. This is a reasonable assumption, since sorting is
+ * almost certainly bottlenecked on memory latency.
+ *
+ * Original, authoritative key cardinality is weighed within
+ * bttext_abort(), so cases where these memcmp() attempts are the
+ * usual outcome and yet usually fail should only arise
+ * infrequently.
+ */
+ if (memcmp(a1p, a2p, len1) == 0)
+ {
+ result = 0;
+ goto done;
+ }
+ }
+
if (len1 >= tss->buflen1)
{
pfree(tss->buf1);
*************** bttextfastcmp_locale(Datum x, Datum y, S
*** 1851,1857 ****
if (len2 >= tss->buflen2)
{
pfree(tss->buf2);
! tss->buflen1 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize));
tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
}
--- 2012,2018 ----
if (len2 >= tss->buflen2)
{
pfree(tss->buf2);
! tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize));
tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
}
*************** bttextfastcmp_locale(Datum x, Datum y, S
*** 1875,1880 ****
--- 2036,2043 ----
if (result == 0)
result = strcmp(tss->buf1, tss->buf2);
+ done:
+
/* We can't afford to leak memory here. */
if (PointerGetDatum(arg1) != x)
pfree(arg1);
*************** bttextfastcmp_locale(Datum x, Datum y, S
*** 1884,1889 ****
--- 2047,2324 ----
return result;
}
+ /*
+ * Conversion routine for sortsupport. Converts original text to abbreviated
+ * key representation. Our encoding strategy is simple -- pack the first 8
+ * bytes of a strxfrm() blob into a Datum.
+ */
+ static Datum
+ bttext_convert(Datum original, SortSupport ssup)
+ {
+ TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
+ text *authoritative = DatumGetTextPP(original);
+
+ /* working state */
+ Datum res;
+ char *pres;
+ int len;
+ Size bsize;
+ uint32 lohalf,
+ hihalf,
+ hash;
+
+ /*
+ * Abbreviated key representation is a pass-by-value Datum that is treated
+ * as a char array by the specialized comparator bttextcmp_abbreviated().
+ */
+ pres = (char *) &res;
+ /* memset() so untouched bytes are NULL */
+ memset(pres, 0, sizeof(Datum));
+ len = VARSIZE_ANY_EXHDR(authoritative);
+
+ /* By convention, we use buffer 1 to store and NULL terminate text */
+ if (len >= tss->buflen1)
+ {
+ pfree(tss->buf1);
+ tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
+ tss->buf1 = palloc(tss->buflen1);
+ }
+
+ /* Just like strcoll(), strxfrm() expects a NULL-terminated string */
+ memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
+ tss->buf1[len] = '\0';
+
+ /* Don't leak memory here */
+ if (PointerGetDatum(authoritative) != original)
+ pfree(authoritative);
+
+ retry:
+
+ /*
+ * There is no special handling of the C locale here. strxfrm() is used
+ * indifferently.
+ */
+ #ifdef HAVE_LOCALE_T
+ if (tss->locale)
+ bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale);
+ else
+ #endif
+ bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
+
+ if (bsize >= tss->buflen2)
+ {
+ /*
+ * The C standard states that the contents of the buffer are now
+ * unspecified. Grow the buffer and retry.
+ */
+ pfree(tss->buf2);
+ tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize));
+ tss->buf2 = palloc(tss->buflen2);
+ goto retry;
+ }
+
+ /*
+ * Maintain approximate cardinality of both abbreviated keys and original,
+ * authoritative keys using HyperLogLog. Used as cheap insurance against
+ * the worst case, where we do many string transformations for no saving in
+ * full strcoll()-based comparisons. These statistics are used by
+ * bttext_abort().
+ *
+ * First, hash the key proper, or a significant fraction of it. Mix in the
+ * length in order to compensate for cases where differences are past
+ * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
+ */
+ hash = DatumGetUInt32(hash_any((unsigned char *) tss->buf1, Min(len, CACHE_LINE_SIZE)));
+
+ if (len > CACHE_LINE_SIZE)
+ hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+ addHyperLogLog(&tss->full_card, hash);
+
+ memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
+
+ /* Hash abbreviated key */
+ lohalf = (uint32) res;
+ hihalf = (uint32) (res >> 32);
+ hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
+
+ addHyperLogLog(&tss->abbr_card, hash);
+
+ /*
+ * Every Datum byte is compared. This is safe because the strxfrm() blob
+ * is itself NULL-terminated, leaving no danger of misinterpreting any NULL
+ * bytes not intended to be interpreted as logically representing
+ * termination.
+ */
+ return res;
+ }
+
+ /*
+ * Callback for estimating the effectiveness of the abbreviated key
+ * optimization, using heuristic rules. Returns a value indicating whether
+ * abbreviation should be aborted, based on its projected effectiveness.
+ */
+ static bool
+ bttext_abort(int memtupcount, double rowhint, SortSupport ssup)
+ {
+ TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
+ double abbrev_distinct,
+ key_distinct,
+ norm_key_card;
+
+ Assert(ssup->position == sortKeyAbbreviated);
+
+ /* Have a little patience, even without a rowhint */
+ if (memtupcount < 20)
+ return false;
+
+ if (rowhint > 0)
+ {
+ double normalized_rows_to_process,
+ estimated_cmps;
+
+ normalized_rows_to_process = (rowhint - memtupcount) / rowhint;
+
+ if (normalized_rows_to_process > 0.95 && memtupcount < 200000)
+ {
+ /*
+ * Be patient -- don't consider aborting until we've processed an
+ * estimated 5% of all rows to be sorted, or 200,000 rows,
+ * whichever is less.
+ */
+ #ifdef DEBUG_ABBREV_KEYS
+ elog(DEBUG_elog_output, "conversion patiently waited after %d tuples of %f",
+ memtupcount, rowhint);
+ #endif
+ return false;
+ }
+ else if (normalized_rows_to_process < 0.65)
+ {
+ /* Already too invested -- don't abort a marginal case */
+ return false;
+ }
+
+ /*
+ * strxfrm() is strongly recommended for large lists of strings. This
+ * is because despite the memory overhead often implied by an approach
+ * using string transformation, the number of comparisons that a
+ * comparison sort algorithm requires increases at least in proportion
+ * to O(n log n). Linearithmic growth will result in a number of
+ * comparisons that is considerably higher than the number of elements.
+ * (top-N heapsorts never use the abbreviation optimization, and so are
+ * not considered here.)
+ *
+ * Unicode Technical Standard #10 states "Because binary comparison is
+ * much faster than string comparison, it is faster to use sort keys
+ * whenever there will be more than about 10 comparisons per string, if
+ * the system can afford the storage". That would amount to
+ * approximately 1,000 list elements on average. While our costs are
+ * clearly different in several ways, this calculus cannot be ignored
+ * entirely. Past a certain point, we are probabilistically better off
+ * holding out for some improvement even if there is an abbreviated key
+ * cardinality of 1 thus far. That point is somewhat arbitrarily
+ * assumed to be 20 comparisons per string (approximately 1 million
+ * estimated rows). We may still lose, but not by terribly much, and
+ * only in cases close to the most pessimal worst case. Even in that
+ * very worst case, as this tuple count threshold is crossed the
+ * regression for internal sorts is at or under 5%. This is helped by
+ * fmgr elision, and not hurt much by useless abbreviated comparisons.
+ * (a strxfrm() is expensive, a memcmp() of a Datum already in L1 cache
+ * much less so.)
+ */
+ estimated_cmps = rowhint * LOG2(rowhint);
+
+ if (estimated_cmps > rowhint * 20)
+ {
+ #ifdef DEBUG_ABBREV_KEYS
+ elog(DEBUG_elog_output, "row estimate too high (%f, estimated cmps: %f) to ever abort",
+ rowhint, estimated_cmps);
+ #endif
+ return false;
+ }
+ }
+
+ abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
+ key_distinct = estimateHyperLogLog(&tss->full_card);
+
+ /*
+ * Clamp cardinality estimates to at least one distinct value. While NULLs
+ * are generally disregarded, if only NULL values were seen so far, that
+ * might misrepresent costs if we failed to clamp.
+ */
+ if (abbrev_distinct <= 1.0)
+ abbrev_distinct = 1.0;
+
+ if (key_distinct <= 1.0)
+ key_distinct = 1.0;
+
+ /*
+ * In the worst case all abbreviated keys are identical, while at the same
+ * time there are differences within full key strings not captured in
+ * abbreviations.
+ */
+ #ifdef DEBUG_ABBREV_KEYS
+ {
+ double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
+
+ elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
+ memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card);
+ }
+ #endif
+
+ /*
+ * If the number of distinct abbreviated keys approximately matches the
+ * number of distinct authoritative original keys, that's reason enough to
+ * proceed. We can win even with a very low cardinality set if most
+ * tie-breakers only memcmp(). This is by far the most important
+ * consideration.
+ *
+ * While comparisons that are resolved at the abbreviated key level are
+ * considerably cheaper than tie-breakers resolved with memcmp(), both of
+ * those two outcomes are so much cheaper than a full strcoll() once
+ * sorting is underway that it doesn't seem worth it to weigh abbreviated
+ * cardinality against the overall size of the set in order to more
+ * accurately model costs.
+ */
+ if (abbrev_distinct > key_distinct * 0.05)
+ return false;
+
+ /*
+ * Normalized cardinality is proportion of distinct original, authoritative
+ * keys
+ */
+ norm_key_card = key_distinct / (double) memtupcount;
+
+ /*
+ * If this is a very low cardinality set generally, that is reason enough
+ * to favor our strategy, since many comparisons can be resolved with
+ * inexpensive memcmp() tie-breakers, even though abbreviated keys are of
+ * marginal utility.
+ */
+ if (norm_key_card < 0.05)
+ return false;
+
+ /*
+ * Abort abbreviation strategy.
+ *
+ * The worst case, where all abbreviated keys are identical while all
+ * original strings differ, will typically only see a regression of about
+ * 10% in execution time for small to medium sized lists of strings.
+ * Whereas on modern CPUs where cache stalls are the dominant cost, we can
+ * often expect very large improvements, particularly with sets of strings
+ * of moderately high to high abbreviated cardinality. There is little to
+ * lose but much to gain, which our strategy reflects.
+ */
+ #ifdef DEBUG_ABBREV_KEYS
+ elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
+ memtupcount, abbrev_distinct, key_distinct);
+ /* Actually abort only when debugging is disabled */
+ return false;
+ #endif
+
+ return true;
+ }
+
Datum
text_larger(PG_FUNCTION_ARGS)
{
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
new file mode 100644
index 8e57505..b0041b9
*** a/src/backend/utils/sort/tuplesort.c
--- b/src/backend/utils/sort/tuplesort.c
*************** bool optimize_bounded_sort = true;
*** 150,156 ****
* When sorting single Datums, the data value is represented directly by
* datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false,
* then datum1 points to a separately palloc'd data value that is also pointed
! * to by the "tuple" pointer; otherwise "tuple" is NULL.
*
* While building initial runs, tupindex holds the tuple's run number. During
* merge passes, we re-use it to hold the input tape number that each tuple in
--- 150,159 ----
* When sorting single Datums, the data value is represented directly by
* datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false,
* then datum1 points to a separately palloc'd data value that is also pointed
! * to by the "tuple" pointer; otherwise "tuple" is NULL. There is one special
! * case: when the sort support infrastructure provides an "abbreviated key"
! * representation, where the key is (typically) a pass-by-value proxy for a
! * pass-by-reference type.
*
* While building initial runs, tupindex holds the tuple's run number. During
* merge passes, we re-use it to hold the input tape number that each tuple in
*************** struct Tuplesortstate
*** 353,358 ****
--- 356,370 ----
SortSupport onlyKey;
/*
+ * Additional state for managing "abbreviated key" sortsupport routines
+ * (currently only used in the MinimalTuple case). Tracks the intervals
+ * at which the optimization's effectiveness is tested.
+ */
+ int nextabbrev; /* Tuple # at which to next check applicability */
+ bool aborted; /* Abbreviation process aborted? */
+ double rowsHint; /* Hint # rows to be sorted, -1 if unknown */
+
+ /*
* These variables are specific to the CLUSTER case; they are set by
* tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and
* indexScanKey.
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 600,606 ****
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations,
bool *nullsFirstFlags,
! int workMem, bool randomAccess)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
MemoryContext oldcontext;
--- 612,619 ----
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations,
bool *nullsFirstFlags,
! int workMem, double planRows,
! bool randomAccess)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
MemoryContext oldcontext;
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 632,637 ****
--- 645,652 ----
state->reversedirection = reversedirection_heap;
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
+ state->nextabbrev = 10;
+ state->rowsHint = planRows;
/* Prepare SortSupport data for each column */
state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 648,657 ****
sortKey->ssup_nulls_first = nullsFirstFlags[i];
sortKey->ssup_attno = attNums[i];
PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
}
! if (nkeys == 1)
state->onlyKey = state->sortKeys;
MemoryContextSwitchTo(oldcontext);
--- 663,687 ----
sortKey->ssup_nulls_first = nullsFirstFlags[i];
sortKey->ssup_attno = attNums[i];
+ /*
+ * Convey to the sortsupport routine whether the abbreviation
+ * optimization is applicable in principle
+ */
+ if (i == 0)
+ sortKey->position = sortKeyAbbreviated;
+ else
+ sortKey->position = sortKeyOther;
+
PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
}
! /*
! * The "onlyKey" optimization cannot be used with abbreviated keys, since
! * tie-breaker comparisons may be required. Typically, the optimization is
! * only of value to pass-by-value types anyway, whereas abbreviated keys
! * are typically only of value to pass-by-reference types.
! */
! if (nkeys == 1 && !state->sortKeys->converter)
state->onlyKey = state->sortKeys;
MemoryContextSwitchTo(oldcontext);
*************** tuplesort_begin_datum(Oid datumType, Oid
*** 838,843 ****
--- 868,880 ----
/* Prepare SortSupport data */
state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData));
+ /*
+ * Conversion to the abbreviated representation is infeasible in the Datum case.
+ * It must be possible to subsequently fetch original datum values within
+ * tuplesort_getdatum(), which would require special-case preservation of
+ * original values that we prefer to avoid.
+ */
+ state->onlyKey->position = sortKeyOther;
state->onlyKey->ssup_cxt = CurrentMemoryContext;
state->onlyKey->ssup_collation = sortCollation;
state->onlyKey->ssup_nulls_first = nullsFirstFlag;
*************** tuplesort_set_bound(Tuplesortstate *stat
*** 886,891 ****
--- 923,938 ----
state->bounded = true;
state->bound = (int) bound;
+
+ /*
+ * Bounded sorts are not an effective target for abbreviated key
+ * optimization. Disable.
+ */
+ if (state->sortKeys && state->sortKeys->converter)
+ {
+ state->sortKeys = state->sortKeys->tiebreak;
+ state->sortKeys->position = sortKeyOther;
+ }
}
/*
*************** comparetup_heap(const SortTuple *a, cons
*** 2861,2872 ****
int nkey;
int32 compare;
! /* Compare the leading sort key */
! compare = ApplySortComparator(a->datum1, a->isnull1,
! b->datum1, b->isnull1,
! sortKey);
! if (compare != 0)
! return compare;
/* Compare additional sort keys */
ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
--- 2908,2922 ----
int nkey;
int32 compare;
! if (!state->aborted)
! {
! /* Compare the leading sort key */
! compare = ApplySortComparator(a->datum1, a->isnull1,
! b->datum1, b->isnull1,
! sortKey);
! if (compare != 0)
! return compare;
! }
/* Compare additional sort keys */
ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
*************** comparetup_heap(const SortTuple *a, cons
*** 2874,2879 ****
--- 2924,2958 ----
rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
tupDesc = state->tupDesc;
+
+ /*
+ * If the leading abbreviated comparison returned 0, or the abbreviation
+ * strategy was abandoned, call the tie-breaker comparator using the
+ * original, authoritative representation, which may break the tie when
+ * differences were not captured within the abbreviated representation
+ */
+ if (state->sortKeys->converter)
+ {
+ AttrNumber attno = sortKey->ssup_attno;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
+ Assert(attno == sortKey->tiebreak->ssup_attno);
+ Assert(sortKey->position == sortKeyAbbreviated);
+ Assert(sortKey->tiebreak->position == sortKeyTiebreak);
+
+ datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+ compare = ApplySortComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey->tiebreak);
+ if (compare != 0)
+ return compare;
+ }
+
sortKey++;
for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
{
*************** copytup_heap(Tuplesortstate *state, Sort
*** 2914,2923 ****
/* set up first-column key value */
htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
! stup->datum1 = heap_getattr(&htup,
! state->sortKeys[0].ssup_attno,
! state->tupDesc,
! &stup->isnull1);
}
static void
--- 2993,3035 ----
/* set up first-column key value */
htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
!
! /* Once aborted, we give up on storing anything in datum1 entirely */
! if (state->aborted)
! return;
!
! if (!state->sortKeys->converter)
! {
! /* Store ordinary Datum representation */
! stup->datum1 = heap_getattr(&htup,
! state->sortKeys[0].ssup_attno,
! state->tupDesc,
! &stup->isnull1);
! }
! else
! {
! Datum original;
!
! /* Store abbreviated key representation */
! original = heap_getattr(&htup, state->sortKeys[0].ssup_attno,
! state->tupDesc, &stup->isnull1);
!
! if (stup->isnull1)
! stup->datum1 = original;
! else
! stup->datum1 = state->sortKeys->converter(original,
! state->sortKeys);
!
! if (state->memtupcount >= state->nextabbrev)
! {
! /* Check effectiveness of optimization. Consider aborting. */
! state->nextabbrev *= 2;
! if (state->sortKeys->abort_conversion(state->memtupcount,
! state->rowsHint,
! state->sortKeys))
! state->aborted = true;
! }
! }
}
static void
*************** reversedirection_heap(Tuplesortstate *st
*** 2983,2988 ****
--- 3095,3109 ----
sortKey->ssup_reverse = !sortKey->ssup_reverse;
sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
}
+
+ if (state->sortKeys->tiebreak)
+ {
+ sortKey = state->sortKeys->tiebreak;
+
+ Assert(sortKey->position == sortKeyTiebreak);
+ sortKey->ssup_reverse = !sortKey->ssup_reverse;
+ sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
+ }
}
diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h
new file mode 100644
index ...8a1e15f
*** a/src/include/lib/hyperloglog.h
--- b/src/include/lib/hyperloglog.h
***************
*** 0 ****
--- 1,67 ----
+ /*
+ * hyperloglog.h
+ *
+ * A simple HyperLogLog cardinality estimator implementation
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. The copyright terms of Ohno's
+ * original version (the MIT license) follow.
+ *
+ * src/include/lib/hyperloglog.h
+ */
+
+ /*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+ #ifndef HYPERLOGLOG_H
+ #define HYPERLOGLOG_H
+
+ /*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set. Importantly, it does this by using a fixed amount of
+ * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * hyperLogLogState
+ *
+ * registerWidth register width, in bits ("k")
+ * nRegisters number of registers
+ * alphaMM alpha * m ^ 2 (see initHyperLogLog())
+ * hashesArr array of hashes
+ * arrSize size of hashesArr
+ */
+ typedef struct hyperLogLogState
+ {
+ uint8 registerWidth;
+ Size nRegisters;
+ double alphaMM;
+ uint8 *hashesArr;
+ Size arrSize;
+ } hyperLogLogState;
+
+ extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+ extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash);
+ extern double estimateHyperLogLog(hyperLogLogState *cState);
+ extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+
+ #endif /* HYPERLOGLOG_H */
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
new file mode 100644
index d78f38e..a27f5e3
*** a/src/include/pg_config_manual.h
--- b/src/include/pg_config_manual.h
***************
*** 222,234 ****
#endif
/*
! * Assumed cache line size. This doesn't affect correctness, but can be
! * used for low-level optimizations. Currently, this is only used to pad
! * some data structures in xlog.c, to ensure that highly-contended fields
! * are on different cache lines. Too small a value can hurt performance due
! * to false sharing, while the only downside of too large a value is a few
! * bytes of wasted memory. The default is 128, which should be large enough
! * for all supported platforms.
*/
#define CACHE_LINE_SIZE 128
--- 222,234 ----
#endif
/*
! * Assumed cache line size. This doesn't affect correctness, but can be used
! * for low-level optimizations. Currently, this is used to pad some data
! * structures in xlog.c, to ensure that highly-contended fields are on
! * different cache lines. Too small a value can hurt performance due to false
! * sharing, while the only downside of too large a value is a few bytes of
! * wasted memory. The default is 128, which should be large enough for all
! * supported platforms.
*/
#define CACHE_LINE_SIZE 128
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
new file mode 100644
index 8b6b0de..e7cc0b8
*** a/src/include/utils/sortsupport.h
--- b/src/include/utils/sortsupport.h
***************
*** 21,27 ****
* required to provide all of them. The BTSORTSUPPORT function should
* simply not set any function pointers for mechanisms it doesn't support.
* (However, all opclasses that provide BTSORTSUPPORT are required to provide
! * the comparator function.)
*
* All sort support functions will be passed the address of the
* SortSupportData struct when called, so they can use it to store
--- 21,29 ----
* required to provide all of them. The BTSORTSUPPORT function should
* simply not set any function pointers for mechanisms it doesn't support.
* (However, all opclasses that provide BTSORTSUPPORT are required to provide
! * the comparator function, and opclasses that support the optional
! * abbreviated key capability must provide a tiebreaker state, a converter,
! * and a comparator that works with the abbreviated representation.)
*
* All sort support functions will be passed the address of the
* SortSupportData struct when called, so they can use it to store
***************
*** 49,54 ****
--- 51,63 ----
#include "access/attnum.h"
+ typedef enum
+ {
+ sortKeyOther, /* Second/subsequent key, or no abbreviation capability */
+ sortKeyAbbreviated, /* Leading (abbreviated applicable) key (when available) */
+ sortKeyTiebreak, /* "True" leading key, abbreviation tie-breaker */
+ } SortKeyPosition;
+
typedef struct SortSupportData *SortSupport;
typedef struct SortSupportData
*************** typedef struct SortSupportData
*** 92,103 ****
* than, equal to, or greater than y. Note that x and y are guaranteed
* not null, and there is no way to return null either. Do not return
* INT_MIN, as callers are allowed to negate the result before using it.
*/
int (*comparator) (Datum x, Datum y, SortSupport ssup);
/*
! * Additional sort-acceleration functions might be added here later.
*/
} SortSupportData;
--- 101,203 ----
* than, equal to, or greater than y. Note that x and y are guaranteed
* not null, and there is no way to return null either. Do not return
* INT_MIN, as callers are allowed to negate the result before using it.
+ *
+ * This comparator may require a tie-breaker for opclasses with additional
+ * special support for dealing with an abbreviated key representation.
*/
int (*comparator) (Datum x, Datum y, SortSupport ssup);
/*
! * "Abbreviated key" infrastructure follows. All callbacks must be set by
! * sortsupport opclasses that make use of this optional additional
! * infrastructure.
! *
! * This allows opclass authors to supply a conversion routine, used to
! * create an alternative representation of the underlying type (an
! * "abbreviated key"). Typically, this representation is an ad-hoc,
! * pass-by-value Datum format that only the opclass has knowledge of. An
! * alternative comparator, used only with this alternative representation,
! * must also be provided. This representation is a simple approximation of
! * the original Datum. It must be possible to compare datums of this
! * representation with each other using the supplied alternative
! * comparator, and have any non-zero return value be a reliable proxy for
! * what a proper comparison would indicate. Returning zero from the
! * alternative comparator does not indicate equality, as with a
! * conventional support routine 1, though -- it indicates that it wasn't
! * possible to determine how the two abbreviated values compared. A proper
! * comparison is therefore required. In many cases this results in most or
! * all comparisons only using the cheap alternative comparison func, which
! * is typically implemented as code that compiles to just a few CPU
! * instructions. The technique is particularly useful for internal sorts
! * (i.e. quicksort). CPU cache miss penalties have grown to the point
! * where good overall performance must heavily weigh cache performance.
! *
! * Opclass authors must consider the final cardinality of abbreviated keys
! * when devising an encoding scheme. It's possible for a strategy to work
! * better than an alternative strategy with one usage pattern, while the
! * reverse might be true for another usage pattern. All of these factors
! * should be weighed.
! */
!
! /*
! * Sort key "position" mostly just relates to whether or not the
! * abbreviated key optimization is applicable in principle (that is, the
! * sortsupport routine needs to know if it is dealing with a leading key).
! * Even with a leading key, internal sortsupport clients like tuplesort may
! * represent it as sortKeyOther because it isn't feasible to inject the
! * conversion routine. However, sortKeyTiebreak means that it's a "proper"
! * sortsupport state, originally generated by the sortsupport routine
! * itself - the core system will never directly create a sort support state
! * with tie-break position. There is very little distinction between
! * sortKeyTiebreak and sortKeyOther positions, though. The distinction
! * only exists to allow sortsupport routines to squeeze a bit more
! * performance from the knowledge that an authoritative tie-breaker
! * comparison is required because a prior alternative comparison didn't
! * work out (as opposed to being called without there ever being an
! * abbreviated comparison of the tuple attribute).
! *
! * A sortsupport routine may initially decide against applying the
! * abbreviation optimization by setting a passed sortKeyAbbreviated sort
! * support state's position to sortKeyOther. This is typically used for
! * platform-specific special cases where the optimization is thought to be
! * ineffective.
! */
! SortKeyPosition position;
!
! /*
! * Converter to abbreviated format, from original representation. Core
! * code uses this callback to convert from a pass-by-reference "original"
! * Datum to a pass-by-value abbreviated key Datum. Note that original is
! * guaranteed not null because it doesn't make sense to factor NULL values
! * into the ad-hoc cost model.
! */
! Datum (*converter) (Datum original, SortSupport ssup);
!
! /*
! * This callback allows clients to verify that the current strategy is
! * working out, using an ad-hoc cost model defined by the sortsupport routine.
! * If there are many duplicate abbreviated keys in practice, it's useful to
! * be able to abandon the strategy before paying too high a cost in
! * conversion. rowhint is typically an estimate of total rows that will be
! * sorted originating from the planner, but clients are at liberty to
! * provide a new hint with each call, should better information become
! * available during the course of conversion. (By convention, negative
! * rowhint values are interpreted as "no hint available".)
! */
! bool (*abort_conversion) (int memtupcount, double rowhint,
! SortSupport ssup);
!
! /*
! * Alternative tiebreak SortSupport state for leading (abbreviated) key,
! * used only when alternative comparator returned 0, and the core system
! * must use this separate state to perform an authoritative comparison.
! * This relates to the same attribute as our ssup_attno, but code like
! * tuplesort is required to call it directly as a tie-breaker.
! *
! * It is only ever initialized by a SortSupport routine with abbreviation
! * support, and not any internal code.
*/
+ SortSupport tiebreak;
} SortSupportData;
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
new file mode 100644
index 2537883..e5de4f4
*** a/src/include/utils/tuplesort.h
--- b/src/include/utils/tuplesort.h
*************** extern Tuplesortstate *tuplesort_begin_h
*** 62,68 ****
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations,
bool *nullsFirstFlags,
! int workMem, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
Relation indexRel,
int workMem, bool randomAccess);
--- 62,68 ----
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations,
bool *nullsFirstFlags,
! int workMem, double planRows, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
Relation indexRel,
int workMem, bool randomAccess);
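
To make the sortsupport.h contract a bit easier to review, here is a minimal
sketch (not part of the patch) of what an opclass worker routine has to fill
in to opt in to abbreviation, mirroring what btsortsupport_worker() does in
the patch; the foo_* names are hypothetical placeholders only:

#include "postgres.h"
#include "utils/sortsupport.h"

/* hypothetical opclass callbacks -- placeholders, not in the patch */
static int	foo_cmp_abbreviated(Datum x, Datum y, SortSupport ssup);
static int	foo_cmp_authoritative(Datum x, Datum y, SortSupport ssup);
static Datum foo_convert(Datum original, SortSupport ssup);
static bool foo_abort(int memtupcount, double rowhint, SortSupport ssup);

static void
foo_sortsupport_worker(SortSupport ssup)
{
	if (ssup->position == sortKeyAbbreviated)
	{
		/* Leading key: supply abbreviated comparator plus conversion machinery */
		ssup->comparator = foo_cmp_abbreviated;
		ssup->converter = foo_convert;
		ssup->abort_conversion = foo_abort;

		/* The opclass builds the authoritative tie-breaker state itself */
		ssup->tiebreak = (SortSupport) palloc0(sizeof(SortSupportData));
		ssup->tiebreak->ssup_cxt = ssup->ssup_cxt;
		ssup->tiebreak->ssup_collation = ssup->ssup_collation;
		ssup->tiebreak->ssup_reverse = ssup->ssup_reverse;
		ssup->tiebreak->ssup_nulls_first = ssup->ssup_nulls_first;
		ssup->tiebreak->ssup_attno = ssup->ssup_attno;
		ssup->tiebreak->position = sortKeyTiebreak;
		ssup->tiebreak->comparator = foo_cmp_authoritative;
	}
	else
	{
		/* Non-leading key, tie-break state, or abbreviation not applicable */
		ssup->comparator = foo_cmp_authoritative;
	}
}

tuplesort only ever calls converter/abort_conversion/tiebreak when the
leading key's sortsupport routine actually set them, so opclasses without
abbreviation support are unaffected.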
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers