Hi,
I know Commit Fest is in progress, as well as the holiday season. But
the Summer of Code ends in about three weeks, so I'd like to request a
bit of out-of-order processing :)
My previous mail sent to -hackers is here:
http://archives.postgresql.org/message-id/[EMAIL PROTECTED]
I had problems applying the patch from that mail (it got mangled
somehow, could by my mail agent...), so I'm attaching it again.
There are two things that I'm not particularly proud of in the patch.
First is palloc()ing and filling in a table simply to user qsort() on
it. I guess I could either hack up a version of get_attstatsslot() that
returns an array of (element, frequency) pairs or sort the elements by
hand instead of using qsort() and keep the order of frequencies in sync
while doing the sort.
Another thing are cstring_to_text_with_len calls. I'm doing them so I
can use bttextcmp in bsearch(). I think I could come up with a dedicated
function to return text Datums and WordEntries (read: non-NULL
terminated strings with a given length).
Are these unsignificant? Or should I do these optimizations? Or, sadly,
signs that using binary search is not a good decision?
Thanks,
Jan
--
Jan Urbanski
GPG key ID: E583D7D2
ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths
hunspell_sample.affix \
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \
dict_ispell.o regis.o spell.o \
- to_tsany.o ts_typanalyze.o ts_utils.o
+ to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/tsearch/ts_selfuncs.c
b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..4b86072
--- /dev/null
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -0,0 +1,299 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_selfuncs.c
+ * Selectivity functions for text search types.
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h" /* for check_stack_depth() */
+#include "utils/memutils.h"
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_statistic.h"
+#include "nodes/nodes.h"
+#include "tsearch/ts_type.h"
+
+/* lookup table type for binary searching through MCELEMs */
+typedef struct
+{
+ Datum element;
+ float4 frequency;
+} TextFreq;
+
+static int
+compare_textfreq(const void *e1, const void *e2);
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+ int length, float4 minfreq);
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+ float4 *numbers, int
nnumbers);
+static double
+tsquerysel(VariableStatData *vardata, Datum constval);
+
+
+/* TSQuery traversal function */
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+ int length, float4 minfreq)
+{
+ TextFreq key,
+ *searchres;
+ Selectivity s1, s2;
+
+ /* since this function recurses, it could be driven to stack overflow */
+ check_stack_depth();
+
+ if (item->type == QI_VAL)
+ {
+ QueryOperand *oper = (QueryOperand *) item;
+
+ /*
+ * Prepare the key for bsearch(). No need to initialize
key.frequency,
+ * because sorting is only on key.element.
+ */
+ key.element = PointerGetDatum(
+ cstring_to_text_with_len(operand + oper->distance,
oper->length));
+
+ searchres = (TextFreq *) bsearch(&key, lookup, length,
+
sizeof(TextFreq), compare_textfreq);
+ if (searchres)
+ {
+ /*
+ * The element is in MCELEM. Return precise selectivity
(or at
+ * least as precise, as ANALYZE could find out).
+ */
+ return (Selectivity) searchres->frequency;
+ }
+ else
+ {
+ /*
+ * The element is not in MCELEM. Punt, but assure that
the
+ * selectivity cannot be more than minfreq / 2.
+ */
+ return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+ }
+ }
+
+ /* Current TSQuery node is an operator */
+ switch (item->operator.oper)
+ {
+ case OP_NOT:
+ return 1.0 - tsquery_opr_selec(item + 1, operand,
lookup,
+
length, minfreq);
+
+ case OP_AND:
+ return
+ tsquery_opr_selec(item + 1, operand, lookup,
length, minfreq) *
+ tsquery_opr_selec(item + item->operator.left,
operand, lookup,
+ length,
minfreq);
+
+ case OP_OR:
+ s1 = tsquery_opr_selec(item + 1, operand, lookup,
length, minfreq);
+ s2 = tsquery_opr_selec(item + item->operator.left,
operand, lookup,
+ length,
minfreq);
+ return s1 + s2 - s1 * s2;
+
+ default:
+ elog(ERROR, "unrecognized operator: %d",
item->operator.oper);
+ }
+
+ /* never reached, keep compiler happy */
+ return (Selectivity) DEFAULT_TS_SEL;
+}
+
+/*
+ * Traverse the tsquery preorder, calculating selectivity as:
+ *
+ * selec(left_oper) * selec(right_oper) in AND nodes,
+ *
+ * selec(left_oper) + selec(right_oper) -
+ * selec(left_oper) * selec(right_oper) in OR nodes,
+ *
+ * 1 - select(oper) in NOT nodes
+ *
+ * freq[val] in VAL nodes, if the value is in MCELEM
+ * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+ *
+ *
+ * Implementation-wise, we sort the MCELEM array to use binary
+ * search on it.
+ */
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+ float4 *numbers, int
nnumbers)
+{
+ float4 minfreq;
+ TextFreq *lookup;
+ Selectivity selec;
+ MemoryContext opr_context,
+ old_context;
+ int i;
+
+ /* Grab the lowest frequency */
+ minfreq = numbers[nnumbers - 1];
+
+ /*
+ * Compute selectivity in child context, because we will possibly do a
lot
+ * of QueryOperand to text transformations, which palloc() memory
+ */
+ opr_context = AllocSetContextCreate(CurrentMemoryContext,
+
"Text search restriction",
+
ALLOCSET_DEFAULT_MINSIZE,
+
ALLOCSET_DEFAULT_INITSIZE,
+
ALLOCSET_DEFAULT_MAXSIZE);
+ old_context = MemoryContextSwitchTo(opr_context);
+
+
+ /* Construct the array to be qsort()'ed */
+ Assert(nmcelem == nnumbers);
+ lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+ for (i = 0; i < nmcelem; i++)
+ {
+ lookup[i].element = mcelem[i];
+ lookup[i].frequency = numbers[i];
+ }
+
+ qsort(lookup, nmcelem, sizeof(TextFreq), compare_textfreq);
+
+ selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+ nmcelem, minfreq);
+
+ MemoryContextSwitchTo(old_context);
+ MemoryContextDelete(opr_context);
+
+ return selec;
+}
+
+static int
+compare_textfreq(const void *e1, const void *e2)
+{
+ const TextFreq *t1 = (const TextFreq *) e1;
+ const TextFreq *t2 = (const TextFreq *) e2;
+
+ return DatumGetInt32(DirectFunctionCall2(bttextcmp,
+
t1->element, t2->element));
+}
+
+/*
+ * tssel -- Selectivity of "@@"
+ */
+Datum
+tssel(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+ List *args = (List *) PG_GETARG_POINTER(2);
+ int varRelid = PG_GETARG_INT32(3);
+ VariableStatData vardata;
+ Node *other;
+ bool varonleft;
+ Selectivity selec;
+
+ /*
+ * If expression is not variable op something or something op variable,
+ * then punt and return a default estimate.
+ */
+ if (!get_restriction_variable(root, args, varRelid,
+ &vardata,
&other, &varonleft))
+ {
+ PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ }
+
+ /*
+ * Can't do anything useful if the something is not a constant, either.
+ */
+ if (!IsA(other, Const))
+ {
+ ReleaseVariableStats(vardata);
+ PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ }
+
+ /* The "@@" operator is strict, so might cope with NULL right away */
+ if (((Const *) other)->constisnull) {
+ ReleaseVariableStats(vardata);
+ PG_RETURN_FLOAT8((float8) 0.0);
+ }
+
+ /*
+ * OK, there's a Var and a Const we're dealing with here. We need the
Var
+ * to be a TSVector (or else we don't have any useful statistic for it).
+ */
+
+ if (vardata.vartype == TSVECTOROID)
+ {
+ /* tsvector @@ tsquery or the other way around */
+ Assert(((Const *) other)->consttype == TSQUERYOID);
+
+ selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+ }
+ else
+ {
+ /* The Var is something we don't have useful statistic for */
+ selec = (Selectivity) DEFAULT_TS_SEL;
+ }
+
+ ReleaseVariableStats(vardata);
+
+ PG_RETURN_FLOAT8((float8) selec);
+}
+
+static Selectivity
+tsquerysel(VariableStatData *vardata, Datum constval)
+{
+ Selectivity selec;
+
+ if (HeapTupleIsValid(vardata->statsTuple))
+ {
+ TSQuery query;
+ Form_pg_statistic stats;
+ Datum *values;
+ int nvalues;
+ float4 *numbers;
+ int nnumbers;
+
+ /* The caller made sure the const is a TSQuery, so get it now */
+ query = DatumGetTSQuery(constval);
+
+ stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+ /* MCELEM will be an array of Text elements for a tsvector
column */
+ if (get_attstatsslot(vardata->statsTuple,
+ TEXTOID, -1,
+ STATISTIC_KIND_MCELEM,
InvalidOid,
+ &values, &nvalues,
+ &numbers, &nnumbers))
+ {
+ /*
+ * There is a most-common-elements slot for the
tsvector Var, so
+ * use that.
+ */
+
+ selec = mcelem_tsquery_selec(query, values, nvalues,
+
numbers, nnumbers);
+ free_attstatsslot(TEXTOID, values, nvalues, numbers,
nnumbers);
+ }
+ else
+ {
+ /* No most-common-elements slot */
+ selec = (Selectivity) DEFAULT_TS_SEL;
+ }
+ }
+ else
+ {
+ selec = (Selectivity) DEFAULT_TS_SEL;
+ }
+
+ return selec;
+}
diff --git a/src/include/catalog/pg_operator.h
b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -915,10 +915,10 @@ DATA(insert OID = 3630 ( "<>" PGNSP PGUID b f f
3614 3614 16 3630 3629 ts
DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614 16
3628 3627 tsvector_ge scalargtsel scalargtjoinsel ));
DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16
3627 3628 tsvector_gt scalargtsel scalargtjoinsel ));
DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614 3614
0 0 tsvector_concat - - ));
-DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16
3637 0 ts_match_vq contsel contjoinsel ));
-DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16
3636 0 ts_match_qv contsel contjoinsel ));
-DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16
3661 0 ts_match_vq contsel contjoinsel ));
-DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16
3660 0 ts_match_qv contsel contjoinsel ));
+DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16
3637 0 ts_match_vq tssel contjoinsel ));
+DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16
3636 0 ts_match_qv tssel contjoinsel ));
+DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16
3661 0 ts_match_vq tssel contjoinsel ));
+DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16
3660 0 ts_match_qv tssel contjoinsel ));
DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16
3679 3678 tsquery_lt scalarltsel scalarltjoinsel ));
DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615 16
3678 3679 tsquery_le scalarltsel scalarltjoinsel ));
DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16
3676 3677 tsquery_eq eqsel eqjoinsel ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4321,6 +4321,8 @@ DESCR("GiST tsquery support");
DATA(insert OID = 3701 ( gtsquery_consistent PGNSP PGUID 12
1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_
gtsquery_consistent _null_ _null_ _null_ ));
DESCR("GiST tsquery support");
+DATA(insert OID = 3687 ( tssel PGNSP PGUID 12 1 0 0 f f t f s 4 701
"2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+DESCR("restriction selectivity of tsvector @@ tsquery");
DATA(insert OID = 3688 ( ts_typanalyze PGNSP PGUID 12 1 0 0 f f t f s
1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
DESCR("tsvector typanalyze");
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..6208432 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -291,4 +291,17 @@ extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS);
extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
+/*
+ * The default text search selectivity is chosen to be smll enough to
+ * encourage indexscans for typical table densities. See selfuncs.h and
+ * DEFAULT_EQ_SEL for details.
+ */
+#define DEFAULT_TS_SEL 0.005
+
+/*
+ * operator restriction function for tsvector @@ tsquery and
+ * tsquery @@ tsvector
+ */
+extern Datum tssel(PG_FUNCTION_ARGS);
+
#endif /* _PG_TSTYPE_H_ */
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers