Re: [HACKERS] Statistics and selectivity estimation for ranges

Heikki Linnakangas Fri, 24 Aug 2012 08:52:05 -0700

On 20.08.2012 00:31, Alexander Korotkov wrote:

On Thu, Aug 16, 2012 at 4:40 PM, Heikki Linnakangas<
[email protected]>  wrote:

On 15.08.2012 11:34, Alexander Korotkov wrote:

Ok, we've to decide if we need "standard" histogram. In some cases it can
be used for more accurate estimation of<   and>   operators.
But I think it is not so important. So, we can replace "standard"
histogram
with histograms of lower and upper bounds?


Yeah, I think that makes more sense. The lower bound histogram is still
useful for<  and>  operators, just not as accurate if there are lots of
values with the same lower bound but different upper bound.



New version of patch.
* Collect new stakind STATISTIC_KIND_BOUNDS_HISTOGRAM, which is lower and
upper bounds histograms combined into single ranges array, instead
of STATISTIC_KIND_HISTOGRAM.

One worry I have about that format for the histogram is that youdeserialize all the values in the histogram, before you do the binarysearches. That seems expensive if stats target is very high. I guess youcould deserialize them lazily to alleviate that, though.

* Selectivity estimations for>,>=,<,<=,<<,>>,&<,&>  using this
histogram.


Thanks!

I'm going to do the same for this that I did for the sp-gist patch, andpunt on the more complicated parts for now, and review them separately.Attached is a heavily edited version that doesn't include the lengthhistogram, and consequently doesn't do anything smart for the &< and &>operators. && is estimated using the bounds histograms. There's now aseparate stakind for the empty range fraction, since it's not includedin the length-histogram.

I tested this on a dataset containing birth and death dates of personsthat have a wikipedia page, obtained from the dbpedia.org project. I cansend a copy if someone wants it. The estimates seem pretty accurate.


Please take a look, to see if I messed up something.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index a692086..a929f4a 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -30,7 +30,8 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
 	tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \
 	tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \
 	tsvector.o tsvector_op.o tsvector_parser.o \
-	txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o
+	txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o \
+	rangetypes_typanalyze.o rangetypes_selfuncs.o
 
 like.o: like.c like_match.c
 
diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c
index fe9e0c4..f229a9d 100644
--- a/src/backend/utils/adt/rangetypes.c
+++ b/src/backend/utils/adt/rangetypes.c
@@ -1228,23 +1228,6 @@ hash_range(PG_FUNCTION_ARGS)
 	PG_RETURN_INT32(result);
 }
 
-/* ANALYZE support */
-
-/* typanalyze function for range datatypes */
-Datum
-range_typanalyze(PG_FUNCTION_ARGS)
-{
-	/*
-	 * For the moment, just punt and don't analyze range columns.  If we get
-	 * close to release without having a better answer, we could consider
-	 * letting std_typanalyze do what it can ... but those stats are probably
-	 * next door to useless for most activity with range columns, so it's not
-	 * clear it's worth gathering them.
-	 */
-	PG_RETURN_BOOL(false);
-}
-
-
 /*
  *----------------------------------------------------------
  * CANONICAL FUNCTIONS
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c
new file mode 100644
index 0000000..ebfc427
--- /dev/null
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -0,0 +1,560 @@
+/*-------------------------------------------------------------------------
+ *
+ * rangetypes_selfuncs.c
+ *	  Functions for selectivity estimation of range operators
+ *
+ * Estimates are based on histograms of lower and upper bounds.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/rangetypes_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic.h"
+#include "utils/lsyscache.h"
+#include "utils/rangetypes.h"
+#include "utils/selfuncs.h"
+#include "utils/typcache.h"
+
+static double calc_hist_selectivity(TypeCacheEntry *typcache,
+					VariableStatData *vardata, RangeType *constval, Oid operator);
+static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
+							 RangeBound *constbound,
+							 RangeBound *hist, int hist_nvalues,
+							 bool equal);
+
+static int range_bsearch(TypeCacheEntry *typcache, RangeBound *value,
+			  RangeBound *hist, int hist_length, bool equal);
+static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
+			  RangeType *constval, Oid operator);
+static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
+			 RangeBound *hist1, RangeBound *hist2);
+
+/*
+ * Returns a default selectivity estimate for given operator, when we don't
+ * have statistics or cannot use them for some reason
+ */
+static double
+default_range_selectivity(Oid operator)
+{
+	switch (operator)
+	{
+		case OID_RANGE_OVERLAP_OP:
+			return 0.01;
+
+		case OID_RANGE_CONTAINS_OP:
+		case OID_RANGE_CONTAINED_OP:
+			return 0.005;
+
+		case OID_RANGE_CONTAINS_ELEM_OP:
+			/*
+			 * "range @> elem" is more or less identical to a scalar
+			 * inequality "A > b AND A < c".
+			 */
+			return DEFAULT_RANGE_INEQ_SEL;
+
+		case OID_RANGE_LESS_OP:
+		case OID_RANGE_LESS_EQUAL_OP:
+		case OID_RANGE_GREATER_OP:
+		case OID_RANGE_GREATER_EQUAL_OP:
+		case OID_RANGE_LEFT_OP:
+		case OID_RANGE_RIGHT_OP:
+			/* These are similar to regular scalar inequalities */
+			return DEFAULT_INEQ_SEL;
+
+		default:
+			/* all range operators should be handled above, but just in case */
+			return 0.01;
+	}
+}
+
+/*
+ * rangesel -- restriction selectivity for range operators
+ */
+Datum
+rangesel(PG_FUNCTION_ARGS)
+{
+	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+	Oid			operator = PG_GETARG_OID(1);
+	List	   *args = (List *) PG_GETARG_POINTER(2);
+	int			varRelid = PG_GETARG_INT32(3);
+	VariableStatData vardata;
+	Node	   *other;
+	bool		varonleft;
+	Selectivity selec;
+	TypeCacheEntry *typcache;
+	RangeType  *constval = NULL;
+
+	/*
+	 * If expression is not (variable op something) or (something op
+	 * variable), then punt and return a default estimate.
+	 */
+	if (!get_restriction_variable(root, args, varRelid,
+								  &vardata, &other, &varonleft))
+		PG_RETURN_FLOAT8(default_range_selectivity(operator));
+
+	/*
+	 * Can't do anything useful if the something is not a constant, either.
+	 */
+	if (!IsA(other, Const))
+	{
+		ReleaseVariableStats(vardata);
+		PG_RETURN_FLOAT8(default_range_selectivity(operator));
+	}
+
+	/*
+	 * All the range operators are strict, so we can cope with a NULL constant
+	 * right away.
+	 */
+	if (((Const *) other)->constisnull)
+	{
+		ReleaseVariableStats(vardata);
+		PG_RETURN_FLOAT8(0.0);
+	}
+
+	/*
+	 * If var is on the right, commute the operator, so that we can assume the
+	 * var is on the left in what follows.
+	 */
+	if (!varonleft)
+	{
+		/* we have other Op var, commute to make var Op other */
+		operator = get_commutator(operator);
+		if (!operator)
+		{
+			/* Use default selectivity (should we raise an error instead?) */
+			ReleaseVariableStats(vardata);
+			PG_RETURN_FLOAT8(default_range_selectivity(operator));
+		}
+	}
+
+	typcache = range_get_typcache(fcinfo, vardata.vartype);
+
+	/*
+	 * OK, there's a Var and a Const we're dealing with here.  We need the
+	 * Const to be of same range type as column, else we can't do anything
+	 * useful. (Such cases will likely fail at runtime, but here we'd rather
+	 * just return a default estimate.)
+	 *
+	 * If the operator is "range @> element", the constant should be of the
+	 * element type of the range column. Convert it to a range that includes
+	 * only that single point, so that we don't need special handling for
+	 * that in what follows.
+	 */
+	if (operator == OID_RANGE_CONTAINS_ELEM_OP)
+	{
+		if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
+		{
+			/* Transform "col @> const_elem" case to "col @> const_range" case */
+			RangeBound lower, upper;
+			lower.inclusive = true;
+			lower.val = ((Const *) other)->constvalue;
+			lower.infinite = false;
+			lower.lower = true;
+			upper.inclusive = true;
+			upper.val = ((Const *) other)->constvalue;
+			upper.infinite = false;
+			upper.lower = false;
+			constval = range_serialize(typcache, &lower, &upper, false);
+		}
+	}
+	else
+	{
+		if (((Const *) other)->consttype == vardata.vartype)
+			constval = DatumGetRangeType(((Const *) other)->constvalue);
+	}
+
+	if (constval)
+		selec = calc_rangesel(typcache, &vardata, constval, operator);
+	else
+		selec = default_range_selectivity(operator);
+
+	ReleaseVariableStats(vardata);
+
+	CLAMP_PROBABILITY(selec);
+
+	PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * calc_rangesel -- calculate restriction selectivity for range operators.
+ */
+static double
+calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
+			  RangeType *constval, Oid operator)
+{
+	double		hist_selec;
+	double		selec;
+	float4	   *numbers, empty_frac, null_frac;
+	int			nnumbers;
+
+	if (HeapTupleIsValid(vardata->statsTuple))
+	{
+		Form_pg_statistic stats;
+		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+		null_frac = stats->stanullfrac;
+
+		/* Try to get fraction of empty ranges */
+		if (get_attstatsslot(vardata->statsTuple,
+							 vardata->atttype, vardata->atttypmod,
+							 STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
+							 NULL,
+							 NULL, NULL,
+							 &numbers, &nnumbers))
+		{
+			if (nnumbers != 1)
+				elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
+			empty_frac = numbers[0];
+		}
+		else
+		{
+			/* No empty fraction statistic. Assume no empty ranges. */
+			empty_frac = 0.0;
+		}
+	}
+	else
+	{
+		/*
+		 * No stats are available. Follow through the calculations below
+		 * anyway, assuming no NULLs and no empty ranges. This still allows
+		 * us to give a better-than-nothing estimate based on whether the
+		 * constant is an empty range or not.
+		 */
+		null_frac = 0.0;
+		empty_frac = 0.0;
+	}
+
+	if (RangeIsEmpty(constval))
+	{
+		/*
+		 * An empty range matches all, all empty ranges, or nothing,
+		 * depending on the operator
+		 */
+		switch (operator)
+		{
+			case OID_RANGE_OVERLAP_OP:
+			case OID_RANGE_OVERLAPS_LEFT_OP:
+			case OID_RANGE_OVERLAPS_RIGHT_OP:
+			case OID_RANGE_LEFT_OP:
+			case OID_RANGE_RIGHT_OP:
+				/* these return false if either argument is empty */
+				selec = 0.0;
+				break;
+
+			case OID_RANGE_CONTAINED_OP:
+			case OID_RANGE_LESS_EQUAL_OP:
+			case OID_RANGE_GREATER_EQUAL_OP:
+				/*
+				 * These return true when both args are empty, false if only
+				 * one is empty.
+				 */
+				selec = empty_frac;
+				break;
+
+			case OID_RANGE_CONTAINS_OP:
+				/* everything contains an empty range */
+				selec = 1.0;
+				break;
+
+			case OID_RANGE_CONTAINS_ELEM_OP:
+			default:
+				elog(ERROR, "unexpected operator %u", operator);
+				selec = 0.0; /* keep compiler quiet */
+				break;
+		}
+	}
+	else
+	{
+		/*
+		 * Calculate selectivity using bound histograms.
+		 */
+		hist_selec = calc_hist_selectivity(typcache, vardata, constval, operator);
+		if (hist_selec < 0.0)
+			hist_selec = default_range_selectivity(operator);
+
+		/*
+		 * Now merge the results for the empty ranges and histogram
+		 * calculations, realizing that the histogram covers only the
+		 * non-null, non-empty values.
+		 */
+		if (operator == OID_RANGE_CONTAINED_OP)
+		{
+			/* empty is contained by anything non-empty */
+			selec = (1.0 - empty_frac) * hist_selec + empty_frac;
+		}
+		else
+		{
+			/*
+			 * With any other operator, empty Op non-empty matches nothing.
+			 */
+			selec = (1.0 - empty_frac) * hist_selec;
+		}
+	}
+
+	/* all range operators are strict */
+	selec *= (1.0 - null_frac);
+
+	/* result should be in range, but make sure... */
+	CLAMP_PROBABILITY(selec);
+
+	return selec;
+}
+
+/*
+ * Binary search on an array of range bounds. Returns greatest index of range
+ * bound in array which is less than given range bound. If all range bounds in
+ * array are greater or equal than given range bound, return -1.
+ */
+static int
+range_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
+			  int hist_length, bool equal)
+{
+	int			lower = -1,
+				upper = hist_length - 1,
+				cmp,
+				middle;
+
+	while (lower < upper)
+	{
+		middle = (lower + upper + 1) / 2;
+		cmp = range_cmp_bounds(typcache, &hist[middle], value);
+
+		if (cmp < 0 || (equal && cmp == 0))
+			lower = middle;
+		else
+			upper = middle - 1;
+	}
+	return lower;
+}
+
+/*
+ * Calculate selectivity using histograms of range bounds
+ */
+static double
+calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
+					  RangeType *constval,Oid operator)
+{
+	Datum	   *hist_values;
+	int			nhist;
+	RangeBound *hist_lower;
+	RangeBound *hist_upper;
+	int			i;
+	RangeBound	const_lower;
+	RangeBound	const_upper;
+	bool		empty;
+	double		hist_selec;
+
+	/* Try to get histogram of ranges */
+	if (!(HeapTupleIsValid(vardata->statsTuple) &&
+		  get_attstatsslot(vardata->statsTuple,
+						   vardata->atttype, vardata->atttypmod,
+						   STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+						   NULL,
+						   &hist_values, &nhist,
+						   NULL, NULL)))
+		return -1.0;
+
+	/*
+	 * Convert histogram of ranges into histogram of it's lower or upper
+	 * bounds.
+	 */
+	hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
+	hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
+	for (i = 0; i < nhist; i++)
+	{
+		range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
+						  &hist_lower[i], &hist_upper[i], &empty);
+		/* The histogram should not contain any empty ranges */
+		if (empty)
+			elog(ERROR, "bounds histogram contains an empty range");
+	}
+
+	/* Extract the bounds of the constant value? */
+	range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
+	Assert (!empty);
+
+	/* Calculate selectivity of operator using corresponding function */
+	switch (operator)
+	{
+		case OID_RANGE_LESS_OP:
+			hist_selec =
+				calc_hist_selectivity_scalar(typcache, &const_lower,
+											 hist_lower, nhist, false);
+			break;
+
+		case OID_RANGE_LESS_EQUAL_OP:
+			/*
+			 * Estimate var < const by comparing the lower bounds. The upper
+			 * bound would matter for rows with equal lower bound equal to
+			 * the constant, but presumably there aren't many of those.
+			 */
+			hist_selec =
+				calc_hist_selectivity_scalar(typcache, &const_lower,
+											 hist_lower, nhist, true);
+			break;
+
+		case OID_RANGE_GREATER_OP:
+			/* Like for <, compare just the lower bounds. */
+			hist_selec =
+				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
+												 hist_lower, nhist, true);
+			break;
+
+		case OID_RANGE_GREATER_EQUAL_OP:
+			hist_selec =
+				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
+												 hist_lower, nhist, false);
+			break;
+
+		case OID_RANGE_LEFT_OP:
+			/* var << const if the upper bound of var < lower bound of const */
+			hist_selec =
+				calc_hist_selectivity_scalar(typcache, &const_lower,
+											 hist_upper, nhist, false);
+			break;
+
+		case OID_RANGE_RIGHT_OP:
+			/* var >> const if the lower bound of var > upper bound of const */
+			hist_selec =
+				1 - calc_hist_selectivity_scalar(typcache, &const_upper,
+												 hist_lower, nhist, true);
+			break;
+
+		case OID_RANGE_OVERLAPS_RIGHT_OP:
+			/* compare lower bounds */
+			hist_selec =
+				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
+												 hist_lower, nhist, false);
+			break;
+
+		case OID_RANGE_OVERLAPS_LEFT_OP:
+			/* compare upper bounds */
+			hist_selec =
+				calc_hist_selectivity_scalar(typcache, &const_upper,
+											 hist_upper, nhist, true);
+			break;
+
+		case OID_RANGE_OVERLAP_OP:
+		case OID_RANGE_CONTAINS_ELEM_OP:
+			/*
+			 * A && B <=> NOT (A << B OR A >> B).
+			 *
+			 * "range @> elem" is equivalent to "range && [elem,elem]". The
+			 * caller already constructed the singular range from the element
+			 * constant, so just treat it the same as &&.
+			 */
+			hist_selec =
+				calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
+											 nhist, false);
+			hist_selec +=
+				(1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
+												  nhist, true));
+			hist_selec = 1.0 - hist_selec;
+			break;
+
+		case OID_RANGE_CONTAINS_OP:
+		case OID_RANGE_CONTAINED_OP:
+			/* TODO: not implemented yet */
+			hist_selec = -1.0;
+			break;
+
+		default:
+			elog(ERROR, "unknown range operator %u", operator);
+			hist_selec = -1.0; /* keep compiler quiet */
+			break;
+	}
+
+	return hist_selec;
+}
+
+/*
+ * Get relative position of value in histogram bin in [0,1] range.
+ */
+static float8
+get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
+			 RangeBound *hist2)
+{
+	float8		position;
+
+	if (!hist1->infinite && !hist2->infinite)
+	{
+		float8		bin_width;
+
+		/*
+		 * Both bounds of histogram bin aren't infinite. Since value should be
+		 * in this bin, it shouldn't be infinite.
+		 */
+		Assert(!value->infinite);
+
+		/* Calculate relative position using subdiff function. */
+		bin_width = DatumGetFloat8(FunctionCall2Coll(
+												&typcache->rng_subdiff_finfo,
+													 typcache->rng_collation,
+													 hist2->val,
+													 hist1->val));
+		if (bin_width <= 0.0)
+		{
+			/* Assume any point to be in the center of zero width bin */
+			return 0.5;
+		}
+		position = DatumGetFloat8(FunctionCall2Coll(
+												&typcache->rng_subdiff_finfo,
+													typcache->rng_collation,
+													value->val,
+													hist1->val))
+			/ bin_width;
+
+		/* Relative position must be in [0,1] range */
+		position = Max(position, 0.0);
+		position = Min(position, 1.0);
+		return position;
+	}
+	else if (!hist2->infinite)
+	{
+		/*
+		 * One bound of histogram bin in infinite, another is not. Return 0 or
+		 * 1 depending on if value is infinite.
+		 */
+		if (value->infinite)
+			return 0.0;
+		else
+			return 1.0;
+	}
+	else
+	{
+		/* All bounds should be infinite. Assume this as 0.5 position */
+		Assert(hist1->infinite && hist2->infinite && value->infinite);
+		return 0.5;
+	}
+}
+
+/*
+ * Do selectivity estimation based on comparison.
+ */
+static double
+calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
+							 RangeBound *hist, int hist_nvalues, bool equal)
+{
+	Selectivity	selec;
+	int			index;
+
+	/* Estimate selectivity as number of bins */
+	index = range_bsearch(typcache, constbound, hist, hist_nvalues, equal);
+	selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
+
+	/* Adjust selectivity using linear interpolation */
+	if (index >= 0 && index < hist_nvalues - 1)
+		selec += get_position(typcache, constbound, &hist[index],
+						&hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
+
+	return selec;
+}
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
new file mode 100644
index 0000000..ecf6504
--- /dev/null
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -0,0 +1,234 @@
+/*-------------------------------------------------------------------------
+ *
+ * ragetypes_typanalyze.c
+ *	  Functions for gathering statistics from range columns
+ *
+ * For a range type column, histograms of lower and upper bounds, and
+ * the fraction of NULL and empty ranges are collected.
+ *
+ * Both histograms have the same length, and they are combined into a
+ * single array of ranges. This has the same shape as the histogram that
+ * std_typanalyze would collect, but the values are different. Each range
+ * in the array is a valid range, even though the lower and upper bounds
+ * come from different tuples (for each individual value in the table,
+ * lower <= upper bound, and that's enough to guarantee that the same
+ * applies to the percentiles of lower and upper bounds). In theory, the
+ * standard scalar selectivity functions could be used with that histogram.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/rangetypes_typanalyze.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_operator.h"
+#include "commands/vacuum.h"
+#include "utils/builtins.h"
+#include "utils/rangetypes.h"
+
+static void compute_range_stats(VacAttrStats *stats,
+		   AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
+
+/*
+ * range_typanalyze -- typanalyze function for range columns
+ */
+Datum
+range_typanalyze(PG_FUNCTION_ARGS)
+{
+	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
+	TypeCacheEntry *typcache;
+	Form_pg_attribute attr = stats->attr;
+
+	/* Get information about range type */
+	typcache = range_get_typcache(fcinfo, stats->attrtypid);
+
+	if (attr->attstattarget < 0)
+        attr->attstattarget = default_statistics_target;
+
+	stats->compute_stats = compute_range_stats;
+	stats->extra_data = typcache;
+	/* Same as in std_typanalyze */
+	stats->minrows = 300 * attr->attstattarget;
+
+	PG_RETURN_BOOL(true);
+}
+
+/*
+ * Comparison function for sorting RangeBounds.
+ */
+static int
+range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
+{
+	RangeBound *b1 = (RangeBound *)a1;
+	RangeBound *b2 = (RangeBound *)a2;
+	TypeCacheEntry *typcache = (TypeCacheEntry *)arg;
+
+	return range_cmp_bounds(typcache, b1, b2);
+}
+
+/*
+ * compute_range_stats() -- compute statistics for range column
+ */
+static void
+compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
+					int samplerows, double totalrows)
+{
+	TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
+	int			null_cnt = 0;
+	int			non_null_cnt = 0;
+	int			non_empty_cnt = 0;
+	int			empty_cnt = 0;
+	int			range_no;
+	int			slot_idx;
+	int			num_bins = stats->attr->attstattarget;
+	int			num_hist;
+	RangeBound *lowers, *uppers;
+	double		total_width = 0;
+
+	/* Allocate memory for arrays of range bounds. */
+	lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
+	uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
+
+	/* Loop over the sample ranges. */
+	for (range_no = 0; range_no < samplerows; range_no++)
+	{
+		Datum		value;
+		bool		isnull,
+					empty;
+		RangeType  *range;
+		RangeBound	lower,
+					upper;
+
+		vacuum_delay_point();
+
+		value = fetchfunc(stats, range_no, &isnull);
+		if (isnull)
+		{
+			/* range is null, just count that */
+			null_cnt++;
+			continue;
+		}
+
+		/*
+		 * XXX: should we ignore wide values, like std_typanalyze does, to
+		 * avoid bloating the statistics table?
+		 */
+		total_width += VARSIZE_ANY(DatumGetPointer(value));
+
+		/* Get range and deserialize it for further analysis. */
+		range = DatumGetRangeType(value);
+		range_deserialize(typcache, range, &lower, &upper, &empty);
+
+		if (!empty)
+		{
+			/* Fill bound values for further usage in histograms */
+			lowers[non_empty_cnt] = lower;
+			uppers[non_empty_cnt] = upper;
+			non_empty_cnt++;
+		}
+		else
+			empty_cnt++;
+
+		non_null_cnt++;
+	}
+
+	slot_idx = 0;
+
+	/* We can only compute real stats if we found some non-null values. */
+	if (non_null_cnt > 0)
+	{
+		Datum	   *bound_hist_values;
+		int			pos,
+					posfrac,
+					delta,
+					deltafrac,
+					i;
+		MemoryContext old_cxt;
+		float4	   *emptyfrac;
+
+		stats->stats_valid = true;
+		/* Do the simple null-frac and width stats */
+		stats->stanullfrac = (double) null_cnt / (double) samplerows;
+		stats->stawidth = total_width / (double) non_null_cnt;
+		stats->stadistinct = -1.0;
+
+		/* Must copy the target values into anl_context */
+		old_cxt = MemoryContextSwitchTo(stats->anl_context);
+
+		if (non_empty_cnt > 0)
+		{
+			/* Sort bound values */
+			qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
+					  range_bound_qsort_cmp, typcache);
+			qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
+					  range_bound_qsort_cmp, typcache);
+
+			num_hist = non_empty_cnt;
+			if (num_hist > num_bins)
+				num_hist = num_bins + 1;
+
+			bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
+
+			/*
+			 * The object of this loop is to construct ranges from first and
+			 * last lowers[] and uppers[]  entries along with evenly-spaced
+			 * values in between. So the i'th value is range of
+			 * lowers[(i * (nvals - 1)) / (num_hist - 1)] and
+			 * uppers[(i * (nvals - 1)) / (num_hist - 1)]. But computing that
+			 * subscript directly risks integer overflow when the stats target
+			 * is more than a couple thousand.  Instead we add
+			 * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
+			 * integral and fractional parts of the sum separately.
+			 */
+			delta = (non_empty_cnt - 1) / (num_hist - 1);
+			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
+			pos = posfrac = 0;
+
+			for (i = 0; i < num_hist; i++)
+			{
+				bound_hist_values[i] = PointerGetDatum(range_serialize(
+								typcache, &lowers[pos], &uppers[pos], false));
+				pos += delta;
+				posfrac += deltafrac;
+				if (posfrac >= (num_hist - 1))
+				{
+					/* fractional part exceeds 1, carry to integer part */
+					pos++;
+					posfrac -= (num_hist - 1);
+				}
+			}
+
+			stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
+			stats->stavalues[slot_idx] = bound_hist_values;
+			stats->numvalues[slot_idx] = num_hist;
+			slot_idx++;
+		}
+
+		/* Store the fraction of empty ranges */
+		emptyfrac = (float4 *) palloc(sizeof(float4));
+		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
+		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_EMPTY_FRAC;
+		stats->stanumbers[slot_idx] = emptyfrac;
+		stats->numnumbers[slot_idx] = 1;
+		slot_idx++;
+
+		MemoryContextSwitchTo(old_cxt);
+	}
+	else if (null_cnt > 0)
+	{
+		/* We found only nulls; assume the column is entirely null */
+		stats->stats_valid = true;
+		stats->stanullfrac = 1.0;
+		stats->stawidth = 0;		/* "unknown" */
+		stats->stadistinct = 0.0;	/* "unknown" */
+	}
+	/*
+	 * We don't need to bother cleaning up any of our temporary palloc's. The
+	 * hashtable should also go away, as it used a child memory context.
+	 */
+}
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h
index 9470254..abfec5c 100644
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -1676,32 +1676,45 @@ DATA(insert OID = 3882 (  "="	   PGNSP PGUID b t t 3831 3831 16 3882 3883 range_
 DESCR("equal");
 DATA(insert OID = 3883 (  "<>"	   PGNSP PGUID b f f 3831 3831 16 3883 3882 range_ne neqsel neqjoinsel ));
 DESCR("not equal");
-DATA(insert OID = 3884 (  "<"	   PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt scalarltsel scalarltjoinsel ));
+DATA(insert OID = 3884 (  "<"	   PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt rangesel scalarltjoinsel ));
 DESCR("less than");
-DATA(insert OID = 3885 (  "<="	   PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le scalarltsel scalarltjoinsel ));
+#define OID_RANGE_LESS_OP 3884
+DATA(insert OID = 3885 (  "<="	   PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le rangesel scalarltjoinsel ));
 DESCR("less than or equal");
-DATA(insert OID = 3886 (  ">="	   PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge scalargtsel scalargtjoinsel ));
+#define OID_RANGE_LESS_EQUAL_OP 3885
+DATA(insert OID = 3886 (  ">="	   PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge rangesel scalargtjoinsel ));
 DESCR("greater than or equal");
-DATA(insert OID = 3887 (  ">"	   PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt scalargtsel scalargtjoinsel ));
+#define OID_RANGE_GREATER_OP 3886
+DATA(insert OID = 3887 (  ">"	   PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt rangesel scalargtjoinsel ));
 DESCR("greater than");
-DATA(insert OID = 3888 (  "&&"	   PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps areasel areajoinsel ));
+#define OID_RANGE_GREATER_EQUAL_OP 3887
+DATA(insert OID = 3888 (  "&&"	   PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps rangesel areajoinsel ));
 DESCR("overlaps");
-DATA(insert OID = 3889 (  "@>"	   PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem contsel contjoinsel ));
+#define OID_RANGE_OVERLAP_OP 3888
+DATA(insert OID = 3889 (  "@>"	   PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem rangesel contjoinsel ));
 DESCR("contains");
-DATA(insert OID = 3890 (  "@>"	   PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains contsel contjoinsel ));
+#define OID_RANGE_CONTAINS_ELEM_OP 3889
+DATA(insert OID = 3890 (  "@>"	   PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains rangesel contjoinsel ));
 DESCR("contains");
-DATA(insert OID = 3891 (  "<@"	   PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range contsel contjoinsel ));
+#define OID_RANGE_CONTAINS_OP 3890
+DATA(insert OID = 3891 (  "<@"	   PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range rangesel contjoinsel ));
 DESCR("is contained by");
-DATA(insert OID = 3892 (  "<@"	   PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by contsel contjoinsel ));
+#define OID_RANGE_ELEM_CONTAINED_OP 3891
+DATA(insert OID = 3892 (  "<@"	   PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by rangesel contjoinsel ));
 DESCR("is contained by");
-DATA(insert OID = 3893 (  "<<"	   PGNSP PGUID b f f 3831 3831 16 3894 0 range_before scalarltsel scalarltjoinsel ));
+#define OID_RANGE_CONTAINED_OP 3892
+DATA(insert OID = 3893 (  "<<"	   PGNSP PGUID b f f 3831 3831 16 3894 0 range_before rangesel scalarltjoinsel ));
 DESCR("is left of");
-DATA(insert OID = 3894 (  ">>"	   PGNSP PGUID b f f 3831 3831 16 3893 0 range_after scalargtsel scalargtjoinsel ));
+#define OID_RANGE_LEFT_OP 3893
+DATA(insert OID = 3894 (  ">>"	   PGNSP PGUID b f f 3831 3831 16 3893 0 range_after rangesel scalargtjoinsel ));
 DESCR("is right of");
-DATA(insert OID = 3895 (  "&<"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft scalarltsel scalarltjoinsel ));
+#define OID_RANGE_RIGHT_OP 3894
+DATA(insert OID = 3895 (  "&<"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft rangesel scalarltjoinsel ));
 DESCR("overlaps or is left of");
-DATA(insert OID = 3896 (  "&>"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overright scalargtsel scalargtjoinsel ));
+#define OID_RANGE_OVERLAPS_LEFT_OP 3895
+DATA(insert OID = 3896 (  "&>"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overright rangesel scalargtjoinsel ));
 DESCR("overlaps or is right of");
+#define OID_RANGE_OVERLAPS_RIGHT_OP 3896
 DATA(insert OID = 3897 (  "-|-"    PGNSP PGUID b f f 3831 3831 16 3897 0 range_adjacent contsel contjoinsel ));
 DESCR("is adjacent to");
 DATA(insert OID = 3898 (  "+"	   PGNSP PGUID b f f 3831 3831 3831 3898 0 range_union - - ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 51449a4..77a3b41 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4544,6 +4544,8 @@ DATA(insert OID = 3902 (  hash_range			PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0
 DESCR("hash a range");
 DATA(insert OID = 3916 (  range_typanalyze		PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ range_typanalyze _null_ _null_ _null_ ));
 DESCR("range typanalyze");
+DATA(insert OID = 3169 (  rangesel				PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ rangesel _null_ _null_ _null_ ));
+DESCR("restriction selectivity for range operators");
 
 DATA(insert OID = 3914 (  int4range_canonical		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 3904 "3904" _null_ _null_ _null_ _null_ int4range_canonical _null_ _null_ _null_ ));
 DESCR("convert an int4 range to canonical form");
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index 8724cb5..e63b5e8 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -268,4 +268,19 @@ typedef FormData_pg_statistic *Form_pg_statistic;
  */
 #define STATISTIC_KIND_DECHIST	5
 
+/*
+ * A "empty frac" slot is the fraction of empty ranges in a range-type column.
+ * stavalues is not used and should be NULL. stanumbers contains a single
+ * entry, the fraction of empty ranges (0.0 to 1.0).
+ */
+#define STATISTIC_KIND_RANGE_EMPTY_FRAC	6
+
+/*
+ * A histogram similar to STATISTIC_KIND_HISTOGRAM, but the lower and upper
+ * bound values are calculated and sorted separately. In other words, this is
+ * actually two histograms combined into a single array of ranges. Contains
+ * only non-empty ranges.
+ */
+#define STATISTIC_KIND_BOUNDS_HISTOGRAM	7
+
 #endif   /* PG_STATISTIC_H */
diff --git a/src/include/utils/rangetypes.h b/src/include/utils/rangetypes.h
index fb0b23b..a7eeda0 100644
--- a/src/include/utils/rangetypes.h
+++ b/src/include/utils/rangetypes.h
@@ -170,6 +170,7 @@ extern Datum hash_range(PG_FUNCTION_ARGS);
 
 /* ANALYZE support */
 extern Datum range_typanalyze(PG_FUNCTION_ARGS);
+extern Datum rangesel(PG_FUNCTION_ARGS);
 
 /* Canonical functions */
 extern Datum int4range_canonical(PG_FUNCTION_ARGS);

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Statistics and selectivity estimation for ranges

Reply via email to