On Fri, Aug 22, 2014 at 7:19 AM, Robert Haas <robertmh...@gmail.com> wrote: > Patch 0002 no longer applies; please rebase.
I attach rebased patch. Note that there is currently a bug in the master branch: + if (len2 >= tss->buflen2) + { + pfree(tss->buf2); + tss->buflen1 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); + tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); + } Thanks -- Peter Geoghegan
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c new file mode 100644 index c09ca7e..adf4b8c *** a/src/backend/commands/analyze.c --- b/src/backend/commands/analyze.c *************** compute_scalar_stats(VacAttrStatsP stats *** 2292,2297 **** --- 2292,2303 ---- /* We always use the default collation for statistics */ ssup.ssup_collation = DEFAULT_COLLATION_OID; ssup.ssup_nulls_first = false; + /* + * For now, don't perform abbreviated key conversion, because full values + * are required for MCV slot generation. Supporting that optimization + * would necessitate teaching compare_scalars() to call a tie-breaker. + */ + ssup.position = sortKeyOther; PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c new file mode 100644 index 510d1c5..3b335e7 *** a/src/backend/executor/nodeAgg.c --- b/src/backend/executor/nodeAgg.c *************** initialize_aggregates(AggState *aggstate *** 346,351 **** --- 346,352 ---- { AggStatePerAgg peraggstate = &peragg[aggno]; AggStatePerGroup pergroupstate = &pergroup[aggno]; + Agg *node = (Agg *) aggstate->ss.ps.plan; /* * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate. *************** initialize_aggregates(AggState *aggstate *** 363,368 **** --- 364,373 ---- * We use a plain Datum sorter when there's a single input column; * otherwise sort the full tuple. (See comments for * process_ordered_aggregate_single.) + * + * FIXME: It ought to be useful to force tuplesort_begin_heap() + * case where the abbreviated key optimization can thereby be used, + * even when numInputs == 1. */ peraggstate->sortstate = (peraggstate->numInputs == 1) ? *************** initialize_aggregates(AggState *aggstate *** 377,383 **** peraggstate->sortOperators, peraggstate->sortCollations, peraggstate->sortNullsFirst, ! 
work_mem, false); } /* --- 382,390 ---- peraggstate->sortOperators, peraggstate->sortCollations, peraggstate->sortNullsFirst, ! work_mem, ! node->plan.plan_rows, ! false); } /* diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c new file mode 100644 index 47ed068..677a953 *** a/src/backend/executor/nodeMergeAppend.c --- b/src/backend/executor/nodeMergeAppend.c *************** ExecInitMergeAppend(MergeAppend *node, E *** 137,142 **** --- 137,151 ---- sortKey->ssup_nulls_first = node->nullsFirst[i]; sortKey->ssup_attno = node->sortColIdx[i]; + /* + * It isn't feasible to perform abbreviated key conversion, since + * tuples are pulled into mergestate's binary heap as needed. It would + * likely be counter-productive to convert tuples into an abbreviated + * representation as they're pulled up, so opt out of that additional + * optimization entirely. + */ + sortKey->position = sortKeyOther; + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); } diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c new file mode 100644 index fdf2f4c..39f0ff1 *** a/src/backend/executor/nodeMergejoin.c --- b/src/backend/executor/nodeMergejoin.c *************** MJExamineQuals(List *mergeclauses, *** 229,234 **** --- 229,242 ---- elog(ERROR, "cannot merge using non-equality operator %u", qual->opno); + /* + * sortsupport routine must know if abbreviation optimization is + * applicable in principle. It is never applicable for merge joins + * because there is no convenient opportunity to convert to alternative + * representation. Only elide fmgr trampoline where supported. 
+ */ + clause->ssup.position = sortKeyOther; + /* And get the matching support or comparison function */ Assert(clause->ssup.comparator == NULL); sortfunc = get_opfamily_proc(opfamily, diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c new file mode 100644 index b88571b..31d3ead *** a/src/backend/executor/nodeSort.c --- b/src/backend/executor/nodeSort.c *************** ExecSort(SortState *node) *** 89,94 **** --- 89,95 ---- plannode->collations, plannode->nullsFirst, work_mem, + plannode->plan.plan_rows, node->randomAccess); if (node->bounded) tuplesort_set_bound(tuplesortstate, node->bound); diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile new file mode 100644 index 327a1bc..14f9a46 *** a/src/backend/lib/Makefile --- b/src/backend/lib/Makefile *************** subdir = src/backend/lib *** 12,17 **** top_builddir = ../../.. include $(top_builddir)/src/Makefile.global ! OBJS = ilist.o binaryheap.o stringinfo.o include $(top_srcdir)/src/backend/common.mk --- 12,17 ---- top_builddir = ../../.. include $(top_builddir)/src/Makefile.global ! OBJS = ilist.o binaryheap.o hyperloglog.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c new file mode 100644 index ...4db42bf *** a/src/backend/lib/hyperloglog.c --- b/src/backend/lib/hyperloglog.c *************** *** 0 **** --- 1,228 ---- + /*------------------------------------------------------------------------- + * + * hyperloglog.c + * HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + + /* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + + #include "postgres.h" + + #include <math.h> + + #include "lib/hyperloglog.h" + + #define POW_2_32 (4294967296.0) + #define NEG_POW_2_32 (-4294967296.0) + + static inline uint8 rho(uint32 x, uint8 b); + + /* + * Initialize HyperLogLog track state + * + * bwidth is bit width (so register size will be 2 to the power of bwidth). + * Must be between 4 and 16 inclusive. + */ + void + initHyperLogLog(hyperLogLogState *cState, uint8 bwidth) + { + double alpha; + + if (bwidth < 4 || bwidth > 16) + elog(ERROR, "bit width must be between 4 and 16 inclusive"); + + cState->registerWidth = bwidth; + cState->nRegisters = 1 << bwidth; + cState->arrSize = sizeof(uint8) * cState->nRegisters + 1; + + /* + * Initialize hashes array to zero, not negative infinity, per discussion + * of the coupon collector problem in the HyperLogLog paper + */ + cState->hashesArr = palloc0(cState->arrSize); + + /* + * "alpha" is a value that for each possible number of registers (m) is + * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z + * is "the indicator function" through which we finally compute E, + * estimated cardinality). + */ + switch (cState->nRegisters) + { + case 16: + alpha = 0.673; + break; + case 32: + alpha = 0.697; + break; + case 64: + alpha = 0.709; + break; + default: + alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters); + } + + /* + * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog + * estimate E + */ + cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters; + } + + /* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ + void + addHyperLogLog(hyperLogLogState *cState, uint32 hash) + { + uint8 count; + uint32 index; + + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << cState->registerWidth, + BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + cState->hashesArr[index] = Max(count, cState->hashesArr[index]); + } + + /* + * Estimates cardinality, based on elements added so far + */ + double + estimateHyperLogLog(hyperLogLogState *cState) + { + double result; + double sum = 0.0; + int i; + + for (i = 0; i < cState->nRegisters; i++) + { + sum += 1.0 / pow(2.0, cState->hashesArr[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = cState->alphaMM / sum; + + if (result <= (5.0 / 2.0) * cState->nRegisters) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < cState->nRegisters; i++) + { + if (cState->hashesArr[i] == 0) + zero_count++; + } + + if (zero_count != 0) + result = cState->nRegisters * log((double) cState->nRegisters / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; + } + + /* + * Merges the estimate from one HyperLogLog state to another, returning the + * estimate of their union. + * + * The number of registers in each must match. + */ + void + mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState) + { + int r; + + if (cState->nRegisters != oState->nRegisters) + elog(ERROR, "number of registers mismatch: %zu != %zu", + cState->nRegisters, oState->nRegisters); + + for (r = 0; r < cState->nRegisters; ++r) + { + cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]); + } + } + + + /* + * Worker for addHyperLogLog(). 
+ * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering first 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ + static inline uint8 + rho(uint32 x, uint8 b) + { + uint8 j = 1; + + while (j <= b && !(x & 0x80000000)) + { + j++; + x <<= 1; + } + + return j; + } diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c new file mode 100644 index 135a14b..0a36c84 *** a/src/backend/utils/adt/orderedsetaggs.c --- b/src/backend/utils/adt/orderedsetaggs.c *************** ordered_set_startup(FunctionCallInfo fci *** 274,280 **** qstate->sortOperators, qstate->sortCollations, qstate->sortNullsFirsts, ! work_mem, false); else osastate->sortstate = tuplesort_begin_datum(qstate->sortColType, qstate->sortOperator, --- 274,280 ---- qstate->sortOperators, qstate->sortCollations, qstate->sortNullsFirsts, ! 
work_mem, -1, false); else osastate->sortstate = tuplesort_begin_datum(qstate->sortColType, qstate->sortOperator, diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c new file mode 100644 index 1bd8abb..96df4a9 *** a/src/backend/utils/adt/varlena.c --- b/src/backend/utils/adt/varlena.c *************** *** 17,25 **** --- 17,27 ---- #include <ctype.h> #include <limits.h> + #include "access/hash.h" #include "access/tuptoaster.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" + #include "lib/hyperloglog.h" #include "libpq/md5.h" #include "libpq/pqformat.h" #include "miscadmin.h" *************** *** 32,37 **** --- 34,42 ---- #include "utils/pg_locale.h" #include "utils/sortsupport.h" + #ifdef DEBUG_ABBREV_KEYS + #define DEBUG_elog_output DEBUG1 + #endif /* GUC variable */ int bytea_output = BYTEA_OUTPUT_HEX; *************** typedef struct *** 54,63 **** typedef struct { ! char *buf1; /* 1st string */ ! char *buf2; /* 2nd string */ int buflen1; int buflen2; #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif --- 59,70 ---- typedef struct { ! char *buf1; /* 1st string, or abbreviation original string buf */ ! 
char *buf2; /* 2nd string, or abbreviation strxfrm() buf */ int buflen1; int buflen2; + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif *************** typedef struct *** 75,83 **** --- 82,98 ---- #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n)) #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x) + /* + * Used for calculating number of sort comparisons + */ + #define LOG2(x) (log(x) / 0.693147180559945) + static void btsortsupport_worker(SortSupport ssup, Oid collid); + static int bttextcmp_abbreviated(Datum x, Datum y, SortSupport ssup); static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup); static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup); + static Datum bttext_convert(Datum original, SortSupport ssup); + static bool bttext_abort(int memtupcount, double rowhint, SortSupport ssup); static int32 text_length(Datum str); static text *text_catenate(text *t1, text *t2); static text *text_substring(Datum str, *************** btsortsupport_worker(SortSupport ssup, O *** 1725,1750 **** TextSortSupport *tss; /* - * If LC_COLLATE = C, we can make things quite a bit faster by using - * memcmp() rather than strcoll(). To minimize the per-comparison - * overhead, we make this decision just once for the whole sort. - */ - if (lc_collate_is_c(collid)) - { - ssup->comparator = bttextfastcmp_c; - return; - } - - /* * WIN32 requires complex hacks when the database encoding is UTF-8 (except * when using the "C" collation). For now, we don't optimize that case. */ #ifdef WIN32 ! if (GetDatabaseEncoding() == PG_UTF8) return; #endif /* * We may need a collation-sensitive comparison. To make things faster, * we'll figure out the collation based on the locale id and cache the * result. 
Also, since strxfrm()/strcoll() require NUL-terminated inputs, --- 1740,1811 ---- TextSortSupport *tss; /* * WIN32 requires complex hacks when the database encoding is UTF-8 (except * when using the "C" collation). For now, we don't optimize that case. */ #ifdef WIN32 ! if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid)) return; #endif /* + * On platforms where the abbreviated key for text optimization might have + * bad worst case performance it is avoided entirely. In order to ever + * apply the optimization we require: + * + * * That the platform's strxfrm() meets a certain standard for + * representing as much information as possible in leading bytes. + * + * * That there are a full 8 bytes of storage per Datum on the platform, + * since we pack bytes into that representation. Having only 4 bytes + * could make worse case performance drastically more likely. + * + * The standard applied for strxfrm() is that a significant amount of + * entropy from the original string must be concentrated at the beginning + * of returned blobs. The Mac OS X implementation is known to produce + * blobs with very little entropy; it produces printable blobs with a + * series of 24-bit weights, each one encoded into four bytes. Multiple + * "levels" of weights are separated by a weight of 0. Typically only two + * ASCII characters are represented in the first eight bytes. Therefore, + * the optimization is specially disabled. + * + * Any reasonable implementation will pack primary weights into the start + * of returned blobs. The canonical algorithm's implementation is + * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION + * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key" + * is of particular interest: + * + * http://www.unicode.org/reports/tr10/#Step_3 + * + * The collation algorithm standard goes on to state: + * + * "By default, the algorithm makes use of three fully-customizable levels. 
+ * For the Latin script, these levels correspond roughly to: + * + * alphabetic ordering + * + * diacritic ordering + * + * case ordering. + * + * A final level may be used for tie-breaking between strings not otherwise + * distinguished." + * + * It is generally expected that most non-equal keys will have their + * comparisons resolved at the primary level. If enough comparisons can be + * resolved with just 8 byte abbreviated keys, this optimization is very + * effective (although if there are many tie-breakers that largely only + * perform cheap memcmp() calls, that is also much faster than the + * unoptimized case - see bttext_abort()). + * + * There is no reason to not at least perform fmgr elision on platforms + * where abbreviation isn't expected to be profitable, though. + */ + #if defined(__darwin__) || SIZEOF_DATUM != 8 + ssup->position = sortKeyOther; + #endif + + /* * We may need a collation-sensitive comparison. To make things faster, * we'll figure out the collation based on the locale id and cache the * result. Also, since strxfrm()/strcoll() require NUL-terminated inputs, *************** btsortsupport_worker(SortSupport ssup, O *** 1777,1789 **** #endif } tss->buf1 = palloc(TEXTBUFLEN); tss->buflen1 = TEXTBUFLEN; tss->buf2 = palloc(TEXTBUFLEN); tss->buflen2 = TEXTBUFLEN; ssup->ssup_extra = tss; ! ssup->comparator = bttextfastcmp_locale; } /* --- 1838,1922 ---- #endif } + if (ssup->position != sortKeyAbbreviated) + { + /* + * If LC_COLLATE = C, we can make things quite a bit faster by using + * memcmp() rather than strcoll(). To minimize the per-comparison + * overhead, we make this decision just once for the whole sort. 
+ */ + if (lc_collate_is_c(collid)) + { + ssup->comparator = bttextfastcmp_c; + } + else + { + ssup->comparator = bttextfastcmp_locale; + + /* locale case requires temp buffers */ + tss->buf1 = palloc(TEXTBUFLEN); + tss->buflen1 = TEXTBUFLEN; + tss->buf2 = palloc(TEXTBUFLEN); + tss->buflen2 = TEXTBUFLEN; + ssup->ssup_extra = tss; + } + + return; + } + + /* + * Abbreviated case requires temp buffers for strxfrm() copying, and in the + * *_locale case possibly for eventual tie-breaker comparisons during + * sorting proper + */ tss->buf1 = palloc(TEXTBUFLEN); tss->buflen1 = TEXTBUFLEN; tss->buf2 = palloc(TEXTBUFLEN); tss->buflen2 = TEXTBUFLEN; ssup->ssup_extra = tss; ! ! initHyperLogLog(&tss->abbr_card, 10); ! initHyperLogLog(&tss->full_card, 10); ! ! ssup->comparator = bttextcmp_abbreviated; ! ssup->converter = bttext_convert; ! ssup->abort_conversion = bttext_abort; ! ! ssup->tiebreak = palloc0(sizeof(SortSupportData)); ! ssup->tiebreak->ssup_cxt = ssup->ssup_cxt; ! ssup->tiebreak->ssup_collation = ssup->ssup_collation; ! ssup->tiebreak->ssup_reverse = ssup->ssup_reverse; ! ssup->tiebreak->ssup_nulls_first = ssup->ssup_nulls_first; ! ssup->tiebreak->ssup_attno = ssup->ssup_attno; ! ! /* ! * Initialize tiebreak sortsupport state with an authoritative ! * strcoll()-based comparison func for tie-breaking. ! */ ! ssup->tiebreak->position = sortKeyTiebreak; ! btsortsupport_worker(ssup->tiebreak, collid); ! } ! ! /* ! * Abbreviated key comparison func ! */ ! static int ! bttextcmp_abbreviated(Datum x, Datum y, SortSupport ssup) ! { ! char *a = (char *) &x; ! char *b = (char *) &y; ! int result; ! ! result = memcmp(a, b, sizeof(Datum)); ! ! /* ! * When result = 0, the core system will call bttextfastcmp_c() or ! * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm() ! * blobs cannot indicate *equality* authoritatively, for the same reason ! * that there is a strcoll() strcmp() tie-breaker in varstr_cmp(). ! */ ! 
return result; } /* *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1842,1847 **** --- 1975,2008 ---- len1 = VARSIZE_ANY_EXHDR(arg1); len2 = VARSIZE_ANY_EXHDR(arg2); + if (ssup->position == sortKeyTiebreak && len1 == len2) + { + /* + * Being called as authoritative tie-breaker for an abbreviated key + * comparison. + * + * TODO: Consider using this optimization more frequently, perhaps + * even entirely opportunistically. + * + * In general there is a pretty good chance control reached here + * because the two keys are actually fully equal. Try and give an + * answer using only a cheap memcmp() comparison on the assumption that + * this will indicate equality frequently enough for it to be worth it + * on balance. This is a reasonable assumption, since sorting is + * almost certainly bottlenecked on memory latency. + * + * Original, authoritative key cardinality is weighed within + * bttext_abort(). Cases where attempts at resolving tie-breakers in + * this manner are the usual outcome, and yet those attempts usually + * fail should only arise infrequently. + */ + if (memcmp(a1p, a2p, len1) == 0) + { + result = 0; + goto done; + } + } + if (len1 >= tss->buflen1) { pfree(tss->buf1); *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1851,1857 **** if (len2 >= tss->buflen2) { pfree(tss->buf2); ! tss->buflen1 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); } --- 2012,2018 ---- if (len2 >= tss->buflen2) { pfree(tss->buf2); ! tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); } *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1875,1880 **** --- 2036,2043 ---- if (result == 0) result = strcmp(tss->buf1, tss->buf2); + done: + /* We can't afford to leak memory here. 
*/ if (PointerGetDatum(arg1) != x) pfree(arg1); *************** bttextfastcmp_locale(Datum x, Datum y, S *** 1884,1889 **** --- 2047,2324 ---- return result; } + /* + * Conversion routine for sortsupport. Converts original text to abbreviated + * key representation. Our encoding strategy is simple -- pack the first 8 + * bytes of a strxfrm() blob into a Datum. + */ + static Datum + bttext_convert(Datum original, SortSupport ssup) + { + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + text *authoritative = DatumGetTextPP(original); + + /* working state */ + Datum res; + char *pres; + int len; + Size bsize; + uint32 lohalf, + hihalf, + hash; + + /* + * Abbreviated key representation is a pass-by-value Datum that is treated + * as a char array by the specialized comparator bttextcmp_abbreviated(). + */ + pres = (char *) &res; + /* memset() so untouched bytes are NULL */ + memset(pres, 0, sizeof(Datum)); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* By convention, we use buffer 1 to store and NULL terminate text */ + if (len >= tss->buflen1) + { + pfree(tss->buf1); + tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize)); + tss->buf1 = palloc(tss->buflen1); + } + + /* Just like strcoll(), strxfrm() expects a NULL-terminated string */ + memcpy(tss->buf1, VARDATA_ANY(authoritative), len); + tss->buf1[len] = '\0'; + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + + retry: + + /* + * There is no special handling of the C locale here. strxfrm() is used + * indifferently. + */ + #ifdef HAVE_LOCALE_T + if (tss->locale) + bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale); + else + #endif + bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2); + + if (bsize >= tss->buflen2) + { + /* + * The C standard states that the contents of the buffer is now + * unspecified. Grow buffer, and retry. 
+ */ + pfree(tss->buf2); + tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize)); + tss->buf2 = palloc(tss->buflen2); + goto retry; + } + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string transformations for no saving in + * full strcoll()-based comparisons. These statistics are used by + * bttext_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. + */ + hash = hash_any((unsigned char *) tss->buf1, Min(len, CACHE_LINE_SIZE)); + + if (len > CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&tss->full_card, hash); + + memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize)); + + /* Hash abbreviated key */ + lohalf = (uint32) res; + hihalf = (uint32) (res >> 32); + hash = hash_uint32(lohalf ^ hihalf); + + addHyperLogLog(&tss->abbr_card, hash); + + /* + * Every Datum byte is compared. This is safe because the strxfrm() blob + * is itself NULL-terminated, leaving no danger of misinterpreting any NULL + * bytes not intended to be interpreted as logically representing + * termination. + */ + return res; + } + + /* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. 
+ */ + static bool + bttext_abort(int memtupcount, double rowhint, SortSupport ssup) + { + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + double abbrev_distinct, + key_distinct, + norm_key_card; + + Assert(ssup->position == sortKeyAbbreviated); + + /* Have a little patience, even without a rowhint */ + if (memtupcount < 20) + return false; + + if (rowhint > 0) + { + double normalized_rows_to_process, + estimated_cmps; + + normalized_rows_to_process = (rowhint - memtupcount) / rowhint; + + if (normalized_rows_to_process > 0.95 && memtupcount < 200000) + { + /* + * Be patient -- don't consider aborting until we've processed an + * estimated 5% of all rows to be sorted, or 200,000 rows, + * whichever is less. + */ + #ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "conversion patiently waited after %d tuples of %f", + memtupcount, rowhint); + #endif + return false; + } + else if (normalized_rows_to_process < 0.65) + { + /* Already too invested -- don't abort a marginal case */ + return false; + } + + /* + * strxfrm() is strongly recommended for large lists of strings. This + * is because despite the memory overhead often implied by an approach + * using string transformation, the number of comparisons that a + * comparison sort algorithm requires increases at least in proportion + * to O(n log n). Linearithmic growth will result in a number of + * comparisons that is considerably higher than the number of elements. + * (top-N heapsorts never use the abbreviation optimization, and so are + * not considered here.) + * + * Unicode Technical Standard #10 states "Because binary comparison is + * much faster than string comparison, it is faster to use sort keys + * whenever there will be more than about 10 comparisons per string, if + * the system can afford the storage". That would amount to + * approximately 1,000 list elements on average. While our costs are + * clearly different in several ways, this calculus cannot be ignored + * entirely. 
Past a certain point, we are probabilistically better off + * holding out for some improvement even if there is an abbreviated key + * cardinality of 1 thus far. That point is somewhat arbitrarily + * assumed to be 20 comparisons per string (approximately 1 million + * estimated rows). We may still lose, but not by terribly much, and + * only in cases close to the most pessimal worst case. Even in that + * very worst case, as this tuple count threshold is crossed the + * regression for internal sorts is at or under 5%. This is helped by + * fmgr elision, and not hurt much by useless abbreviated comparisons. + * (a strxfrm() is expensive, a memcmp() of a Datum already in L1 cache + * much less so.) + */ + estimated_cmps = rowhint * LOG2(rowhint); + + if (estimated_cmps > rowhint * 20) + { + #ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "row estimate too high (%f, estimated cmps: %f) to ever abort", + rowhint, estimated_cmps); + #endif + return false; + } + } + + abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); + key_distinct = estimateHyperLogLog(&tss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While NULLs + * are generally disregarded, if only NULL values were seen so far, that + * might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct <= 1.0) + abbrev_distinct = 1.0; + + if (key_distinct <= 1.0) + key_distinct = 1.0; + + /* + * In the worst case all abbreviated keys are identical, while at the same + * time there are differences within full key strings not captured in + * abbreviations. 
+ */ + #ifdef DEBUG_ABBREV_KEYS + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card); + } + #endif + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct authoritative original keys, that's reason enough to + * proceed. We can win even with a very low cardinality set if most + * tie-breakers only memcmp(). This is by far the most important + * consideration. + * + * While comparisons that are resolved at the abbreviated key level are + * considerably cheaper than tie-breakers resolved with memcmp(), both of + * those two outcomes are so much cheaper than a full strcoll() once + * sorting is underway that it doesn't seem worth it to weigh abbreviated + * cardinality against the overall size of the set in order to more + * accurately model costs. + */ + if (abbrev_distinct > key_distinct * 0.05) + return false; + + /* + * Normalized cardinality is proportion of distinct original, authoritative + * keys + */ + norm_key_card = key_distinct / (double) memtupcount; + + /* + * If this is a very low cardinality set generally, that is reason enough + * to favor our strategy, since many comparisons can be resolved with + * inexpensive memcmp() tie-breakers, even though abbreviated keys are of + * marginal utility. + */ + if (norm_key_card < 0.05) + return false; + + /* + * Abort abbreviation strategy. + * + * The worst case, where all abbreviated keys are identical while all + * original strings differ will typically only see a regression of about + * 10% in execution time for small to medium sized lists of strings. + * Whereas on modern CPUs where cache stalls are the dominant cost, we can + * often expect very large improvements, particularly with sets of strings + * of moderately high to high abbreviated cardinality. 
There is little to + * lose but much to gain, which our strategy reflects. + */ + #ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f", + memtupcount, abbrev_distinct, key_distinct); + /* Actually abort only when debugging is disabled */ + return false; + #endif + + return true; + } + Datum text_larger(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c new file mode 100644 index 8e57505..b0041b9 *** a/src/backend/utils/sort/tuplesort.c --- b/src/backend/utils/sort/tuplesort.c *************** bool optimize_bounded_sort = true; *** 150,156 **** * When sorting single Datums, the data value is represented directly by * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, * then datum1 points to a separately palloc'd data value that is also pointed ! * to by the "tuple" pointer; otherwise "tuple" is NULL. * * While building initial runs, tupindex holds the tuple's run number. During * merge passes, we re-use it to hold the input tape number that each tuple in --- 150,159 ---- * When sorting single Datums, the data value is represented directly by * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, * then datum1 points to a separately palloc'd data value that is also pointed ! * to by the "tuple" pointer; otherwise "tuple" is NULL. There is one special ! * case: when the sort support infrastructure provides an "abbreviated key" ! * representation, where the key is (typically) a pass by value proxy for a ! * pass by reference type. * * While building initial runs, tupindex holds the tuple's run number. 
During * merge passes, we re-use it to hold the input tape number that each tuple in *************** struct Tuplesortstate *** 353,358 **** --- 356,370 ---- SortSupport onlyKey; /* + * Additional state for managing "abbreviated key" sortsupport routines + * (currently only used in the MinimalTuple case). Tracks the intervals + * at which the optimization's effectiveness is tested. + */ + int nextabbrev; /* Tuple # at which to next check applicability */ + bool aborted; /* Abbreviation process aborted? */ + double rowsHint; /* Hint # rows to be sorted, -1 if unknown */ + + /* * These variables are specific to the CLUSTER case; they are set by * tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and * indexScanKey. *************** tuplesort_begin_heap(TupleDesc tupDesc, *** 600,606 **** int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, bool randomAccess) { Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); MemoryContext oldcontext; --- 612,619 ---- int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, double planRows, ! bool randomAccess) { Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); MemoryContext oldcontext; *************** tuplesort_begin_heap(TupleDesc tupDesc, *** 632,637 **** --- 645,652 ---- state->reversedirection = reversedirection_heap; state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->nextabbrev = 10; + state->rowsHint = planRows; /* Prepare SortSupport data for each column */ state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); *************** tuplesort_begin_heap(TupleDesc tupDesc, *** 648,657 **** sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } ! 
if (nkeys == 1) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); --- 663,687 ---- sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; + /* + * Convey to sortsupport routine if abbreviation optimization is + * applicable in principle + */ + if (i == 0) + sortKey->position = sortKeyAbbreviated; + else + sortKey->position = sortKeyOther; + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } ! /* ! * The "onlyKey" optimization cannot be used with abbreviated keys, since ! * tie-breaker comparisons may be required. Typically, the optimization is ! * only of value to pass-by-value types anyway, whereas abbreviated keys ! * are typically only of value to pass-by-reference types. ! */ ! if (nkeys == 1 && !state->sortKeys->converter) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); *************** tuplesort_begin_datum(Oid datumType, Oid *** 838,843 **** --- 868,880 ---- /* Prepare SortSupport data */ state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData)); + /* + * Conversion to abbreviated representation infeasible in the Datum case. + * It must be possible to subsequently fetch original datum values within + * tuplesort_getdatum(), which would require special-case preservation of + * original values that we prefer to avoid. + */ + state->onlyKey->position = sortKeyOther; state->onlyKey->ssup_cxt = CurrentMemoryContext; state->onlyKey->ssup_collation = sortCollation; state->onlyKey->ssup_nulls_first = nullsFirstFlag; *************** tuplesort_set_bound(Tuplesortstate *stat *** 886,891 **** --- 923,938 ---- state->bounded = true; state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable. 
+ */ + if (state->sortKeys && state->sortKeys->converter) + { + state->sortKeys = state->sortKeys->tiebreak; + state->sortKeys->position = sortKeyOther; + } } /* *************** comparetup_heap(const SortTuple *a, cons *** 2861,2872 **** int nkey; int32 compare; ! /* Compare the leading sort key */ ! compare = ApplySortComparator(a->datum1, a->isnull1, ! b->datum1, b->isnull1, ! sortKey); ! if (compare != 0) ! return compare; /* Compare additional sort keys */ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; --- 2908,2922 ---- int nkey; int32 compare; ! if (!state->aborted) ! { ! /* Compare the leading sort key */ ! compare = ApplySortComparator(a->datum1, a->isnull1, ! b->datum1, b->isnull1, ! sortKey); ! if (compare != 0) ! return compare; ! } /* Compare additional sort keys */ ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; *************** comparetup_heap(const SortTuple *a, cons *** 2874,2879 **** --- 2924,2958 ---- rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); tupDesc = state->tupDesc; + + /* + * If a leading abbreviated comparison returned 0 or abbreviation strategy + * was abandoned, call tie-breaker comparator using original, authoritative + * representation, which may break tie when differences were not captured + * within abbreviated representation + */ + if (state->sortKeys->converter) + { + AttrNumber attno = sortKey->ssup_attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + Assert(attno == sortKey->tiebreak->ssup_attno); + Assert(sortKey->position == sortKeyAbbreviated); + Assert(sortKey->tiebreak->position == sortKeyTiebreak); + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey->tiebreak); + if (compare != 0) + return compare; + } + sortKey++; for (nkey = 1; 
nkey < state->nKeys; nkey++, sortKey++) { *************** copytup_heap(Tuplesortstate *state, Sort *** 2914,2923 **** /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); ! stup->datum1 = heap_getattr(&htup, ! state->sortKeys[0].ssup_attno, ! state->tupDesc, ! &stup->isnull1); } static void --- 2993,3035 ---- /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); ! ! /* Once aborted, we give up on storing anything in datum1 entirely */ ! if (state->aborted) ! return; ! ! if (!state->sortKeys->converter) ! { ! /* Store ordinary Datum representation */ ! stup->datum1 = heap_getattr(&htup, ! state->sortKeys[0].ssup_attno, ! state->tupDesc, ! &stup->isnull1); ! } ! else ! { ! Datum original; ! ! /* Store abbreviated key representation */ ! original = heap_getattr(&htup, state->sortKeys[0].ssup_attno, ! state->tupDesc, &stup->isnull1); ! ! if (stup->isnull1) ! stup->datum1 = original; ! else ! stup->datum1 = state->sortKeys->converter(original, ! state->sortKeys); ! ! if (state->memtupcount >= state->nextabbrev) ! { ! /* Check effectiveness of optimization. Consider aborting. */ ! state->nextabbrev *= 2; ! if (state->sortKeys->abort_conversion(state->memtupcount, ! state->rowsHint, ! state->sortKeys)) ! state->aborted = true; ! } ! 
} } static void *************** reversedirection_heap(Tuplesortstate *st *** 2983,2988 **** --- 3095,3109 ---- sortKey->ssup_reverse = !sortKey->ssup_reverse; sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; } + + if (state->sortKeys->tiebreak) + { + sortKey = state->sortKeys->tiebreak; + + Assert(sortKey->position == sortKeyTiebreak); + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } } diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h new file mode 100644 index ...8a1e15f *** a/src/include/lib/hyperloglog.h --- b/src/include/lib/hyperloglog.h *************** *** 0 **** --- 1,67 ---- + /* + * hyperloglog.h + * + * A simple HyperLogLog cardinality estimator implementation + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. The copyright term's of Ohno's + * original version (the MIT license) follow. + * + * src/include/lib/hyperloglog.h + */ + + /* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + + #ifndef HYPERLOGLOG_H + #define HYPERLOGLOG_H + + /* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * hyperLogLogState + * + * registerWidth register width, in bits ("k") + * nRegisters number of registers + * alphaMM alpha * m ^ 2 (see initHyperLogLog()) + * hashesArr array of hashes + * arrSize size of hashesArr + */ + typedef struct hyperLogLogState + { + uint8 registerWidth; + Size nRegisters; + double alphaMM; + uint8 *hashesArr; + Size arrSize; + } hyperLogLogState; + + extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth); + extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash); + extern double estimateHyperLogLog(hyperLogLogState *cState); + extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState); + + #endif /* HYPERLOGLOG_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h new file mode 100644 index d78f38e..a27f5e3 *** a/src/include/pg_config_manual.h --- b/src/include/pg_config_manual.h *************** *** 222,234 **** #endif /* ! * Assumed cache line size. This doesn't affect correctness, but can be ! * used for low-level optimizations. Currently, this is only used to pad ! * some data structures in xlog.c, to ensure that highly-contended fields ! * are on different cache lines. Too small a value can hurt performance due ! * to false sharing, while the only downside of too large a value is a few ! * bytes of wasted memory. 
The default is 128, which should be large enough ! * for all supported platforms. */ #define CACHE_LINE_SIZE 128 --- 222,234 ---- #endif /* ! * Assumed cache line size. This doesn't affect correctness, but can be used ! * for low-level optimizations. Currently, this is used to pad some data ! * structures in xlog.c, to ensure that highly-contended fields are on ! * different cache lines. Too small a value can hurt performance due to false ! * sharing, while the only downside of too large a value is a few bytes of ! * wasted memory. The default is 128, which should be large enough for all ! * supported platforms. */ #define CACHE_LINE_SIZE 128 diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h new file mode 100644 index 8b6b0de..e7cc0b8 *** a/src/include/utils/sortsupport.h --- b/src/include/utils/sortsupport.h *************** *** 21,27 **** * required to provide all of them. The BTSORTSUPPORT function should * simply not set any function pointers for mechanisms it doesn't support. * (However, all opclasses that provide BTSORTSUPPORT are required to provide ! * the comparator function.) * * All sort support functions will be passed the address of the * SortSupportData struct when called, so they can use it to store --- 21,29 ---- * required to provide all of them. The BTSORTSUPPORT function should * simply not set any function pointers for mechanisms it doesn't support. * (However, all opclasses that provide BTSORTSUPPORT are required to provide ! * the comparator function, and opclasses that support the optional additional ! * abbreviated key capability must provide a tiebreaker state, converter and a ! * comparator that works with the abbreviated representation.) 
* * All sort support functions will be passed the address of the * SortSupportData struct when called, so they can use it to store *************** *** 49,54 **** --- 51,63 ---- #include "access/attnum.h" + typedef enum + { + sortKeyOther, /* Second/subsequent key, or no abbreviation capability */ + sortKeyAbbreviated, /* Leading (abbreviated applicable) key (when available) */ + sortKeyTiebreak, /* "True" leading key, abbreviation tie-breaker */ + } SortKeyPosition; + typedef struct SortSupportData *SortSupport; typedef struct SortSupportData *************** typedef struct SortSupportData *** 92,103 **** * than, equal to, or greater than y. Note that x and y are guaranteed * not null, and there is no way to return null either. Do not return * INT_MIN, as callers are allowed to negate the result before using it. */ int (*comparator) (Datum x, Datum y, SortSupport ssup); /* ! * Additional sort-acceleration functions might be added here later. */ } SortSupportData; --- 101,203 ---- * than, equal to, or greater than y. Note that x and y are guaranteed * not null, and there is no way to return null either. Do not return * INT_MIN, as callers are allowed to negate the result before using it. + * + * This comparator may require a tie-breaker for opclasses with additional + * special support for dealing with an abbreviated key representation. */ int (*comparator) (Datum x, Datum y, SortSupport ssup); /* ! * "Abbreviated key" infrastructure follows. All callbacks must be set by ! * sortsupport opclasses that make use of this optional additional ! * infrastructure. ! * ! * This allows opclass authors to supply a conversion routine, used to ! * create an alternative representation of the underlying type (an ! * "abbreviated key"). Typically, this representation is an ad-hoc, ! * pass-by-value Datum format that only the opclass has knowledge of. An ! * alternative comparator, used only with this alternative representation ! * must also be provided. 
This representation is a simple approximation of ! * the original Datum. It must be possible to compare datums of this ! * representation with each other using the supplied alternative ! * comparator, and have any non-zero return value be a reliable proxy for ! * what a proper comparison would indicate. Returning zero from the ! * alternative comparator does not indicate equality, as with a ! * conventional support routine 1, though -- it indicates that it wasn't ! * possible to determine how the two abbreviated values compared. A proper ! * comparison is therefore required. In many cases this results in most or ! * all comparisons only using the cheap alternative comparison func, which ! * is typically implemented as code that compiles to just a few CPU ! * instructions. The technique is particularly useful for internal sorts ! * (i.e. quicksort). CPU cache miss penalties have grown to the point ! * where good overall performance must heavily weigh cache performance. ! * ! * Opclass authors must consider the final cardinality of abbreviated keys ! * when devising an encoding scheme. It's possible for a strategy to work ! * better than an alternative strategy with one usage pattern, while the ! * reverse might be true for another usage pattern. All of these factors ! * should be weighed. ! */ ! ! /* ! * Sort key "position" mostly just relates to whether or not the ! * abbreviated key optimization is applicable in principle (that is, the ! * sortsupport routine needs to know if its dealing with a leading key). ! * Even with a leading key, internal sortsupport clients like tuplesort may ! * represent it as sortKeyOther because it isn't feasible to inject the ! * conversion routine. However, sortKeyTiebreak means that it's a "proper" ! * sortsupport state, originally generated by the sortsupport routine ! * itself - the core system will never directly create a sort support state ! * with tie-break position. There is very little distinction between ! 
* sortKeyTiebreak and sortKeyOther positions, though. The distinction ! * only exists to allow sortsupport routines to squeeze a bit more ! * performance from the knowledge that an authoritative tie-breaker ! * comparison is required because a prior alternative comparison didn't ! * work out (as opposed to being called without there ever being an ! * abbreviated comparison of the tuple attribute). ! * ! * A sortsupport routine may initially decide against applying the ! * abbreviation optimization by setting a passed sortKeyAbbreviated sort ! * support state's position to sortKeyOther. This is typically used for ! * platform-specific special cases where the optimization is thought to be ! * ineffective. ! */ ! SortKeyPosition position; ! ! /* ! * Converter to abbreviated format, from original representation. Core ! * code uses this callback to convert from a pass-by-reference "original" ! * Datum to a pass-by-value abbreviated key Datum. Note that original is ! * guaranteed not null because it doesn't make sense to factor NULL values ! * into ad-hoc cost model. ! */ ! Datum (*converter) (Datum original, SortSupport ssup); ! ! /* ! * This callback allows clients to verify that the current strategy is ! * working out, using a sortsupport routine defined ad-hoc cost model. If ! * there is a lot of duplicate abbreviated keys in practice, it's useful to ! * be able to abandon the strategy before paying too high a cost in ! * conversion. rowhint is typically an estimate of total rows that will be ! * sorted originating from the planner, but clients are at liberty to ! * provide a new hint with each call, should better information become ! * available during the course of conversion. (By convention, negative ! * rowhint values are interpreted as "no hint available".) ! */ ! bool (*abort_conversion) (int memtupcount, double rowhint, ! SortSupport ssup); ! ! /* ! * Alternative tiebreak SortSupport state for leading (abbreviated) key, ! 
* used only when alternative comparator returned 0, and the core system ! * must use this separate state to perform an authoritative comparison. ! * This relates to the same attribute as our ssup_attno, but code like ! * tuplesort is required to call it directly as a tie-breaker. ! * ! * It is only ever initialized by a SortSupport routine with abbreviation ! * support, and not any internal code. */ + SortSupport tiebreak; } SortSupportData; diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h new file mode 100644 index 2537883..e5de4f4 *** a/src/include/utils/tuplesort.h --- b/src/include/utils/tuplesort.h *************** extern Tuplesortstate *tuplesort_begin_h *** 62,68 **** int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, bool randomAccess); --- 62,68 ---- int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, ! int workMem, double planRows, bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, bool randomAccess);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers