Heikki Linnakangas wrote:
Jan Urbański wrote:
So right now the idea is to:
 (1) pre-sort STATISTIC_KIND_MCELEM values
 (2) build an array of pointers to detoasted values in tssel()
 (3) use binary search when looking for MCELEMs during tsquery analysis

Sounds like a plan. In (2), it's even better to detoast the values lazily. For a typical one-word tsquery, the binary search will only look at a small portion of the elements.

Here's another version.

Most common lexemes get sorted before storing in pg_statistic. The ordering is on length first and value second. That way we can avoid strncmp() calls when the lexemes have different lengths (and lexemes know their lengths, so the data is readily available). Also, in the binary search routine during selectivity estimation we can sometimes avoid detoasting (I think) Datums from the pg_statistic MCELEM array. See comments in code.

Pre-sorting introduced one problem (see XXX in code): it's not easy anymore to get the minimal frequency of MCELEM values. I was using it to assert that the selectivity of a tsquery node containing a lexeme not in MCELEM is no more that min(MCELEM freqs) / 2. That's only significant when the minimum frequency is less than DEFAULT_TS_SEL * 2, so I'm kind of inclined to ignore it and maybe drop a comment in the code that this may be a potential problem.

If nothing is fundamentally broken with this, I'll repeat my profiling tests to see if anything has been gained.

Cheers,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths 
hunspell_sample.affix \
 OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
-       to_tsany.o ts_typanalyze.o ts_utils.o
+       to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
 
 include $(top_srcdir)/src/backend/common.mk
 
diff --git a/src/backend/tsearch/ts_selfuncs.c 
b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..0fadc60
--- /dev/null
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -0,0 +1,320 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_selfuncs.c
+ *       Selectivity functions for text search types.
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h" /* for check_stack_depth() */
+#include "utils/memutils.h"
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_statistic.h"
+#include "nodes/nodes.h"
+#include "tsearch/ts_type.h"
+
+/* lookup table type for binary searching through MCELEMs */
+typedef struct
+{
+       Datum   element;
+       float4  frequency;
+} TextFreq;
+
+/* type of keys for bsearch()ing through an array of TextFreqs  */
+typedef struct
+{
+       char    *lexeme;
+       int             length;
+} LexemeKey;
+
+static int
+compare_lexeme_textfreq(const void *e1, const void *e2);
+
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq);
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int 
nnumbers);
+static double
+tsquerysel(VariableStatData *vardata, Datum constval);
+
+
+/* TSQuery traversal function */
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq)
+{
+       LexemeKey       key;
+       TextFreq        *searchres;
+       Selectivity     s1, s2;
+
+       /* since this function recurses, it could be driven to stack overflow */
+       check_stack_depth();
+
+       if (item->type == QI_VAL)
+       {
+               QueryOperand *oper = (QueryOperand *) item;
+
+               /*
+                * Prepare the key for bsearch().
+                */
+               key.lexeme = operand + oper->distance;
+               key.length = oper->length;
+
+               searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                                                               
 sizeof(TextFreq), compare_lexeme_textfreq);
+
+               if (searchres)
+               {
+                       /*
+                        * The element is in MCELEM. Return precise selectivity 
(or at
+                        * least as precise, as ANALYZE could find out).
+                        */
+                       return (Selectivity) searchres->frequency;
+               }
+               else
+               {
+                       /*
+                        * The element is not in MCELEM. Punt, but assert that 
the
+                        * selectivity cannot be more than minfreq / 2.
+                        */
+                       return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+               }
+       }
+
+       /* Current TSQuery node is an operator */
+       switch (item->operator.oper)
+       {
+               case OP_NOT:
+                       return 1.0 - tsquery_opr_selec(item + 1, operand, 
lookup,
+                                                                               
   length, minfreq);
+
+               case OP_AND:
+                       return
+                               tsquery_opr_selec(item + 1, operand, lookup, 
length, minfreq) *
+                               tsquery_opr_selec(item + item->operator.left, 
operand, lookup,
+                                                                 length, 
minfreq);
+
+               case OP_OR:
+                       s1 = tsquery_opr_selec(item + 1, operand, lookup, 
length, minfreq);
+                       s2 = tsquery_opr_selec(item + item->operator.left, 
operand, lookup,
+                                                                  length, 
minfreq);
+                       return s1 + s2 - s1 * s2;
+
+               default:
+                       elog(ERROR, "unrecognized operator: %d", 
item->operator.oper);
+       }
+
+       /* never reached, keep compiler happy */
+       return (Selectivity) DEFAULT_TS_SEL;
+}
+
+/*
+ * Traverse the tsquery preorder, calculating selectivity as:
+ *
+ *   selec(left_oper) * selec(right_oper) in AND nodes,
+ *
+ *   selec(left_oper) + selec(right_oper) -
+ *      selec(left_oper) * selec(right_oper) in OR nodes,
+ *
+ *   1 - select(oper) in NOT nodes
+ *
+ *   freq[val] in VAL nodes, if the value is in MCELEM
+ *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+ *
+ *
+ * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
+ * binary search for determining freq[MCELEM].
+ */
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int 
nnumbers)
+{
+       float4                  minfreq;
+       TextFreq                *lookup;
+       Selectivity             selec;
+       int                             i;
+
+       /* Grab the lowest frequency */
+
+       /*
+         XXX This is no longer easy, need to go through the whole list. Plug it
+         with 2.0 for the moment (so that minfreq / 2 is 1.0)
+
+         minfreq = numbers[nnumbers - 1];
+       */
+       minfreq = 2.0;
+
+       /* Construct the array for binary search */
+       Assert(nmcelem == nnumbers);
+       lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+       for (i = 0; i < nmcelem; i++)
+       {
+               lookup[i].element = mcelem[i];
+               lookup[i].frequency = numbers[i];
+       }
+
+       selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                                                         nmcelem, minfreq);
+
+       pfree(lookup);
+
+       return selec;
+}
+
+/*
+ * bsearch() comparator for a lexeme (non-NULL terminated string with length)
+ * and a TextFreq. Use length, then byte-for-byte comparision, because that's
+ * how ANALYZE code sorted data before storing it in a statistic tuple.
+ * See ts_typanalyze.c for details.
+ */
+static int
+compare_lexeme_textfreq(const void *e1, const void *e2)
+{
+       const LexemeKey *key;
+       const TextFreq  *t;
+       text                    *element;
+       int                             len1,
+                                       len2;
+
+       key = (const LexemeKey *) e1;
+       t = (const TextFreq *) e2;
+
+       len1 = key->length;
+       len2 = VARSIZE_ANY_EXHDR(t->element);
+
+       /* Compare lengths first, possibly avoiding pointless detoasting */
+       if (len1 > len2)
+               return 1;
+       else if (len1 < len2)
+               return -1;
+
+       /* Fall back on byte-for-byte comparision, detoasting the Datum first */
+       element = DatumGetTextP(t->element);
+
+       return strncmp(key->lexeme, VARDATA_ANY(element), len1);
+}
+
+/*
+ *     tssel -- Selectivity of "@@"
+ */
+Datum
+tssel(PG_FUNCTION_ARGS)
+{
+       PlannerInfo             *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+       /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+       List                    *args = (List *) PG_GETARG_POINTER(2);
+       int                             varRelid = PG_GETARG_INT32(3);
+       VariableStatData vardata;
+       Node                    *other;
+       bool                    varonleft;
+       Selectivity             selec;
+
+       /*
+        * If expression is not variable op something or something op variable,
+        * then punt and return a default estimate.
+        */
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, 
&other, &varonleft))
+       {
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+       }
+
+       /*
+        * Can't do anything useful if the something is not a constant, either.
+        */
+       if (!IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+       }
+
+       /* The "@@" operator is strict, so might cope with NULL right away */
+       if (((Const *) other)->constisnull) {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8((float8) 0.0);
+       }
+
+       /*
+        * OK, there's a Var and a Const we're dealing with here. We need the 
Var
+        * to be a TSVector (or else we don't have any useful statistic for it).
+        */
+
+       if (vardata.vartype == TSVECTOROID)
+       {
+               /* tsvector @@ tsquery or the other way around */
+               Assert(((Const *) other)->consttype == TSQUERYOID);
+
+               selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+       }
+       else
+       {
+               /* The Var is something we don't have useful statistic for */
+               selec = DEFAULT_TS_SEL;
+       }
+
+       ReleaseVariableStats(vardata);
+
+       PG_RETURN_FLOAT8((float8) selec);
+}
+
+static Selectivity
+tsquerysel(VariableStatData *vardata, Datum constval)
+{
+       Selectivity                     selec;
+
+       if (HeapTupleIsValid(vardata->statsTuple))
+       {
+               TSQuery                         query;
+               Form_pg_statistic       stats;
+               Datum                           *values;
+               int                                     nvalues;
+               float4                          *numbers;
+               int                                     nnumbers;
+
+               /* The caller made sure the const is a TSQuery, so get it now */
+               query = DatumGetTSQuery(constval);
+
+               stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+               /* MCELEM will be an array of Text elements for a tsvector 
column */
+               if (get_attstatsslot(vardata->statsTuple,
+                                                        TEXTOID, -1,
+                                                        STATISTIC_KIND_MCELEM, 
InvalidOid,
+                                                        &values, &nvalues,
+                                                        &numbers, &nnumbers))
+               {
+                       /*
+                        * There is a most-common-elements slot for the 
tsvector Var, so
+                        * use that.
+                        */
+
+                       selec = mcelem_tsquery_selec(query, values, nvalues,
+                                                                               
 numbers, nnumbers);
+                       free_attstatsslot(TEXTOID, values, nvalues, numbers, 
nnumbers);
+               }
+               else
+               {
+                       /* No most-common-elements slot */
+                       selec = (Selectivity) DEFAULT_TS_SEL;
+               }
+       }
+       else
+       {
+               selec = (Selectivity) DEFAULT_TS_SEL;
+       }
+
+       return selec;
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c 
b/src/backend/tsearch/ts_typanalyze.c
index 965e758..201c4ab 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -43,8 +43,9 @@ static void compute_tsvector_stats(VacAttrStats *stats,
 static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
 static uint32 lexeme_hash(const void *key, Size keysize);
 static int lexeme_match(const void *key1, const void *key2, Size keysize);
-static int trackitem_compare_desc(const void *e1, const void *e2);
-
+static int lexeme_compare(const void *key1, const void *key2);
+static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
+static int trackitem_compare_lexemes(const void *e1, const void *e2);
 
 /*
  *     ts_typanalyze -- a custom typanalyze function for tsvector columns
@@ -273,7 +274,7 @@ compute_tsvector_stats(VacAttrStats *stats,
                Assert(i == track_len);
 
                qsort(sort_table, track_len, sizeof(TrackItem *),
-                         trackitem_compare_desc);
+                         trackitem_compare_frequencies_desc);
 
                /* Suppress any single-occurrence items */
                while (track_len > 0)
@@ -287,6 +288,22 @@ compute_tsvector_stats(VacAttrStats *stats,
                if (num_mcelem > track_len)
                        num_mcelem = track_len;
 
+               /*
+                * We want to store statistics sorted on the lexeme value using 
first
+                * length, then byte-for-byte comparision. The reason for doing 
length
+                * comparision first is that we don't care about the ordering 
as long
+                * as it's consistent and comparing lengths first gives us a 
chance to
+                * avoid a strncmp() call.
+                *
+                * This is different from what we do with scalar statistics -- 
they get
+                * sorted on frequencies. The rationale is that we usually 
search
+                * through most common elements looking for a specific value, 
so we can
+                * grab its frequency. When values are presorted we can employ 
binary
+                * search for that. See ts_selfuncs.c for a real usage scenario.
+                */
+               qsort(sort_table, num_mcelem, sizeof(TrackItem *),
+                         trackitem_compare_lexemes);
+
                /* Generate MCELEM slot entry */
                if (num_mcelem > 0)
                {
@@ -379,25 +396,49 @@ lexeme_hash(const void *key, Size keysize)
 static int
 lexeme_match(const void *key1, const void *key2, Size keysize)
 {
-       const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
-       const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
+       /* The keysize parameter is superfluous, the keys store their lengths */
+       return lexeme_compare(key1, key2);
+}
 
-       /* The lexemes need to have the same length, and be memcmp-equal */
-       if (d1->length == d2->length &&
-               memcmp(d1->lexeme, d2->lexeme, d1->length) == 0)
-               return 0;
-       else
+/*
+ *     Comparision function for lexemes.
+ */
+static int
+lexeme_compare(const void *key1, const void *key2)
+{
+       const LexemeHashKey     *d1 = (const LexemeHashKey *) key1;
+       const LexemeHashKey     *d2 = (const LexemeHashKey *) key2;
+
+       /* First, compare by length */
+       if (d1->length > d2->length)
                return 1;
+       else if (d1->length < d2->length)
+               return -1;
+       else
+               /* Lengths are equal, do a byte-for-byte comparision */
+               return strncmp(d1->lexeme, d2->lexeme, d1->length);
 }
 
 /*
- *     qsort() comparator for TrackItems - LC style (descending sort)
+ *     qsort() comparator for sorting TrackItems on frequencies (descending 
sort)
  */
 static int
-trackitem_compare_desc(const void *e1, const void *e2)
+trackitem_compare_frequencies_desc(const void *e1, const void *e2)
 {
        const TrackItem * const *t1 = (const TrackItem * const *) e1;
        const TrackItem * const *t2 = (const TrackItem * const *) e2;
 
        return (*t2)->frequency - (*t1)->frequency;
 }
+
+/*
+ *     qsort() comparator for sorting TrackItem on lexemes
+ */
+static int
+trackitem_compare_lexemes(const void *e1, const void *e2)
+{
+       const TrackItem * const *t1 = (const TrackItem * const *) e1;
+       const TrackItem * const *t2 = (const TrackItem * const *) e2;
+
+       return lexeme_compare(&(*t1)->key, &(*t2)->key);
+}
diff --git a/src/include/catalog/pg_operator.h 
b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -915,10 +915,10 @@ DATA(insert OID = 3630 (  "<>"       PGNSP PGUID b f f 
3614        3614    16 3630 3629    ts
 DATA(insert OID = 3631 (  ">="    PGNSP PGUID b f f 3614        3614    16 
3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
 DATA(insert OID = 3632 (  ">"     PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
 DATA(insert OID = 3633 (  "||"    PGNSP PGUID b f f 3614        3614    3614  
0        0        tsvector_concat   -    -         ));
-DATA(insert OID = 3636 (  "@@"    PGNSP PGUID b f f 3614        3615    16 
3637        0        ts_match_vq   contsel     contjoinsel   ));
-DATA(insert OID = 3637 (  "@@"    PGNSP PGUID b f f 3615        3614    16 
3636        0        ts_match_qv   contsel     contjoinsel   ));
-DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614       3615    16 
3661        0        ts_match_vq   contsel     contjoinsel   ));
-DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615       3614    16 
3660        0        ts_match_qv   contsel     contjoinsel   ));
+DATA(insert OID = 3636 (  "@@"    PGNSP PGUID b f f 3614        3615    16 
3637        0        ts_match_vq   tssel       contjoinsel   ));
+DATA(insert OID = 3637 (  "@@"    PGNSP PGUID b f f 3615        3614    16 
3636        0        ts_match_qv   tssel       contjoinsel   ));
+DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614       3615    16 
3661        0        ts_match_vq   tssel       contjoinsel   ));
+DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615       3614    16 
3660        0        ts_match_qv   tssel       contjoinsel   ));
 DATA(insert OID = 3674 (  "<"     PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
 DATA(insert OID = 3675 (  "<="    PGNSP PGUID b f f 3615        3615    16 
3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
 DATA(insert OID = 3676 (  "="     PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4321,6 +4321,8 @@ DESCR("GiST tsquery support");
 DATA(insert OID = 3701 (  gtsquery_consistent                  PGNSP PGUID 12 
1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ 
gtsquery_consistent _null_ _null_ _null_ ));
 DESCR("GiST tsquery support");
 
+DATA(insert OID = 3687 (  tssel        PGNSP PGUID 12 1 0 0 f f t f s 4 701 
"2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+DESCR("restriction selectivity of tsvector @@ tsquery");
 DATA(insert OID = 3688 (  ts_typanalyze        PGNSP PGUID 12 1 0 0 f f t f s 
1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
 DESCR("tsvector typanalyze");
 
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..b253327 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -155,6 +155,8 @@ extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);
 
 extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
 
+extern Datum tssel(PG_FUNCTION_ARGS);
+
 
 /*
  * TSQuery
@@ -291,4 +293,17 @@ extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS);
 extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
 extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
 
+/*
+ * The default text search selectivity is chosen to be smll enough to
+ * encourage indexscans for typical table densities. See selfuncs.h and
+ * DEFAULT_EQ_SEL for details.
+ */
+#define DEFAULT_TS_SEL 0.005
+
+/*
+ * operator restriction function for tsvector @@ tsquery and
+ * tsquery @@ tsvector
+ */
+extern Datum tssel(PG_FUNCTION_ARGS);
+
 #endif   /* _PG_TSTYPE_H_ */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to