Re: [HACKERS] gsoc, oprrest function for text search take 2

Jan Urbański Fri, 19 Sep 2008 09:25:00 -0700

[EMAIL PROTECTED] wrote:

Quoting Tom Lane <[EMAIL PROTECTED]>:

I wrote:

...  One possibly
performance-relevant point is to use DatumGetTextPP for detoasting;
you've already paid the costs by using VARDATA_ANY etc, so you might
as well get the benefit.


Actually, wait a second.  That code doesn't work at all on toasted data,
because it's trying to use VARSIZE_ANY_EXHDR() before detoasting.
That would give you the physical datum size (eg the size of the toast
pointer), not the number you need.

However, this is actually not a problem because we know that the data
came from an array in pg_statistic, which means the individual members
*can't be toasted*.  At least they can't be compressed or out-of-line.
We'd do that at the array level, it's not sensible to do it on an
individual array member.

I think that right at the moment the array stuff doesn't permit short
headers either, but it would make sense to relax that someday.  So I'd
recommend that your code allow either regular or short headers, but not
worry about compression or out-of-line storage.

Which boils down to: keep using VARSIZE_ANY_EXHDR/VARDATA_ANY, but
forget the "detoasting" step.  Maybe put in
    Assert(!VARATT_IS_COMPRESSED(datum) && !VARATT_IS_EXTERNAL(datum))
instead.

Well whaddya know. It turned out that my new company has a 'Fridays-are-for-any-opensource-hacking-you-like' policy, so I got a full day to work on the patch. Attached is a version that stores the minimal and maximal frequencies in the Numbers array, has the aforementioned assertion and more nicely ordered functions in ts_selfuncs.c.


I tested it with oprofile and
pgbench -n -f tssel-bench.sql -t 1000 postgres
with tssel-bench.sql containing
select * from manuals where tsvector @@ to_tsquery('foo');

"manuals" has ~700 rows and 'foo' does not appear in any of the lexemes.

The results are:
=== CVS HEAD ===
scaling factor: 1
query mode: simple
number of clients: 1
number of transactions per client: 1000
number of transactions actually processed: 1000/1000
tps = 13.399584 (including connections establishing)
tps = 13.399972 (excluding connections establishing)

74069    34.7779  pglz_decompress
38560    18.1052  tsvectorout
7688      3.6098  pg_mblen
5366      2.5195  hash_search_with_hash_value
4833      2.2693  pg_utf_mblen
4718      2.2153  AllocSetAlloc
4041      1.8974  index_getnext
3100      1.4556  LWLockAcquire
3056      1.4349  hash_any
2843      1.3349  LWLockRelease
2611      1.2260  AllocSetFree
2126      0.9982  tsCompareString
2121      0.9959  _bt_compare
1830      0.8592  LockAcquire
1517      0.7123  toast_fetch_datum
1503      0.7057  .plt
1338      0.6282  _bt_checkkeys
1332      0.6254  FunctionCall2
1233      0.5789  ReadBuffer_common
1185      0.5564  slot_deform_tuple
1157      0.5433  TParserGet
1123      0.5273  LockRelease


=== PATCH ===
transaction type: Custom query
scaling factor: 1
query mode: simple
number of clients: 1
number of transactions per client: 1000
number of transactions actually processed: 1000/1000
tps = 13.309346 (including connections establishing)
tps = 13.309761 (excluding connections establishing)

171514   35.0802  pglz_decompress
87231    17.8416  tsvectorout
17107     3.4989  pg_mblen
12514     2.5595  hash_search_with_hash_value
11124     2.2752  pg_utf_mblen
10739     2.1965  AllocSetAlloc
8534      1.7455  index_getnext
7460      1.5258  LWLockAcquire
6876      1.4064  LWLockRelease
6622      1.3544  hash_any
5773      1.1808  AllocSetFree
5210      1.0656  _bt_compare
4849      0.9918  tsCompareString
4043      0.8269  LockAcquire
3535      0.7230  .plt
3246      0.6639  _bt_checkkeys
3170      0.6484  toast_fetch_datum
3057      0.6253  FunctionCall2
2815      0.5758  ReadBuffer_common
2767      0.5659  TParserGet
2605      0.5328  slot_deform_tuple
2567      0.5250  MemoryContextAlloc

Cheers,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin

*** a/doc/src/sgml/catalogs.sgml
--- b/doc/src/sgml/catalogs.sgml
***************
*** 6664,6669 ****
--- 6664,6671 ----
         A list of the frequencies of the most common values or elements,
         i.e., number of occurrences of each divided by total number of rows.
         (NULL when <structfield>most_common_vals</structfield> is.)
+        For some datatypes such as <type>tsvector</>, it can also store some
+        additional information, i.e. be longer than the most_common_vals array.
        </entry>
       </row>
  
*** a/src/backend/tsearch/Makefile
--- b/src/backend/tsearch/Makefile
***************
*** 19,25 **** DICTFILES=synonym_sample.syn thesaurus_sample.ths 
hunspell_sample.affix \
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
!       to_tsany.o ts_typanalyze.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 19,25 ----
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
!       to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
*** /dev/null
--- b/src/backend/tsearch/ts_selfuncs.c
***************
*** 0 ****
--- 1,323 ----
+ /*-------------------------------------------------------------------------
+  *
+  * ts_selfuncs.c
+  *      Selectivity functions for text search types.
+  *
+  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+  *
+  *
+  * IDENTIFICATION
+  *      $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #include "miscadmin.h" /* for check_stack_depth() */
+ #include "utils/memutils.h"
+ #include "utils/builtins.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "utils/selfuncs.h"
+ #include "catalog/pg_type.h"
+ #include "catalog/pg_statistic.h"
+ #include "nodes/nodes.h"
+ #include "tsearch/ts_type.h"
+ 
+ /*
+  * Lookup table entry for binary searching through MCELEMs: pairs one
+  * most-common element with its frequency.
+  */
+ typedef struct
+ {
+       Datum   element;        /* the MCELEM value (read as a text varlena by the comparator) */
+       float4  frequency;      /* frequency taken from the matching Numbers cell */
+ } TextFreq;
+ 
+ /*
+  * Type of keys for bsearch()ing through an array of TextFreqs: a lexeme
+  * given as pointer plus length, since it is not NUL-terminated.
+  */
+ typedef struct
+ {
+       char    *lexeme;        /* first byte of the lexeme */
+       int             length; /* length of the lexeme in bytes */
+ } LexemeKey;
+ 
+ /*
+  * Forward declarations of the file-local helpers.  The return types are
+  * spelled exactly as in the definitions (Selectivity, not double).
+  */
+ static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
+ static Selectivity mcelem_tsquery_selec(TSQuery query,
+                                                  Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int nnumbers);
+ static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
+                                 TextFreq *lookup, int length, float4 minfreq);
+ static int compare_lexeme_textfreq(const void *e1, const void *e2);
+ 
+ /*
+  *    tssel -- restriction selectivity estimator for "@@"
+  *
+  * Reads the planner info (argument 0), the operator's argument list
+  * (argument 2) and varRelid (argument 3).  The operator OID (argument 1) is
+  * not looked at -- it's gonna be "@@".  Returns the estimate as a float8.
+  */
+ Datum
+ tssel(PG_FUNCTION_ARGS)
+ {
+       PlannerInfo             *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+       List                    *args = (List *) PG_GETARG_POINTER(2);
+       int                             varRelid = PG_GETARG_INT32(3);
+       VariableStatData vardata;
+       Node                    *other;
+       bool                    varonleft;
+       Selectivity             selec;
+ 
+       /*
+        * Unless the expression is "variable op something" or "something op
+        * variable" we punt with the default estimate.  Nothing has been
+        * acquired in that case, so there is nothing to release either.
+        */
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, &other, &varonleft))
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ 
+       if (!IsA(other, Const))
+       {
+               /* Can't do anything useful if the something is not a constant */
+               selec = DEFAULT_TS_SEL;
+       }
+       else if (((Const *) other)->constisnull)
+       {
+               /* The "@@" operator is strict, so a NULL constant selects nothing */
+               selec = 0.0;
+       }
+       else if (vardata.vartype == TSVECTOROID)
+       {
+               /*
+                * A tsvector Var against a Const: this is the only combination we
+                * have a useful statistic for, and the Const must be a tsquery.
+                */
+               Const   *tsq = (Const *) other;
+ 
+               Assert(tsq->consttype == TSQUERYOID);
+               selec = tsquerysel(&vardata, tsq->constvalue);
+       }
+       else
+       {
+               /* The Var is something we don't have useful statistic for */
+               selec = DEFAULT_TS_SEL;
+       }
+ 
+       /* Every path that got this far holds vardata and must give it back */
+       ReleaseVariableStats(vardata);
+ 
+       PG_RETURN_FLOAT8((float8) selec);
+ }
+ 
+ /*
+  * tsquerysel -- selectivity of a tsvector variable matched against a
+  * tsquery constant.
+  *
+  * vardata describes the variable; constval is the constant, which the caller
+  * has already verified to be a tsquery.  When the variable's statistics tuple
+  * has a most-common-elements slot the estimate is derived from it, otherwise
+  * DEFAULT_TS_SEL is returned.
+  */
+ static Selectivity
+ tsquerysel(VariableStatData *vardata, Datum constval)
+ {
+       TSQuery         query;
+       Datum           *values;
+       int                     nvalues;
+       float4          *numbers;
+       int                     nnumbers;
+       Selectivity     selec;
+ 
+       /* The caller made sure the const is a TSQuery, so get it now */
+       query = DatumGetTSQuery(constval);
+ 
+       /*
+        * An empty query has no items, so there is nothing the traversal could
+        * legitimately read.  It is taken to match nothing.
+        * NOTE(review): confirm this agrees with what the "@@" support
+        * functions return for an empty tsquery.
+        */
+       if (query->size == 0)
+               return (Selectivity) 0.0;
+ 
+       /* No statistics tuple at all: punt */
+       if (!HeapTupleIsValid(vardata->statsTuple))
+               return (Selectivity) DEFAULT_TS_SEL;
+ 
+       /* MCELEM will be an array of Text elements for a tsvector column */
+       if (!get_attstatsslot(vardata->statsTuple,
+                                                 TEXTOID, -1,
+                                                 STATISTIC_KIND_MCELEM, InvalidOid,
+                                                 &values, &nvalues,
+                                                 &numbers, &nnumbers))
+       {
+               /* No most-common-elements slot */
+               return (Selectivity) DEFAULT_TS_SEL;
+       }
+ 
+       /*
+        * There is a most-common-elements slot for the tsvector Var, so use
+        * that, and hand the slot back once the estimate is computed.
+        */
+       selec = mcelem_tsquery_selec(query, values, nvalues,
+                                                                numbers, nnumbers);
+       free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
+ 
+       return selec;
+ }
+ 
+ /*
+  * Estimate the selectivity of a tsquery from a most-common-elements slot.
+  *
+  * The tsquery is traversed preorder (by tsquery_opr_selec), calculating
+  * selectivity as:
+  *
+  *   selec(left_oper) * selec(right_oper) in AND nodes,
+  *
+  *   selec(left_oper) + selec(right_oper) -
+  *      selec(left_oper) * selec(right_oper) in OR nodes,
+  *
+  *   1 - selec(oper) in NOT nodes
+  *
+  *   freq[val] in VAL nodes, if the value is in MCELEM
+  *   min(DEFAULT_TS_SEL, min(freq[MCELEM]) / 2) in VAL nodes, if it is not
+  *
+  * mcelem/nmcelem are the slot's Values and numbers/nnumbers its Numbers.
+  * The Numbers array is expected to hold one frequency per element followed
+  * by two extra cells, the minimal and the maximal frequency.  If the slot
+  * does not have that layout, DEFAULT_TS_SEL is returned.
+  *
+  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
+  * binary search for determining freq[MCELEM].
+  */
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int nnumbers)
+ {
+       float4                  minfreq;
+       TextFreq                *lookup;
+       Selectivity             selec;
+       int                             i;
+ 
+       /*
+        * There should be two more Numbers than Values, because the last two
+        * cells are taken for minimal and maximal frequency.  Check that at run
+        * time, before touching the array: with any other layout (for instance
+        * as many Numbers as Values) indexing nnumbers - 2 would pick up the
+        * wrong cell or read outside the array.
+        */
+       if (nnumbers != nmcelem + 2)
+               return (Selectivity) DEFAULT_TS_SEL;
+ 
+       /*
+        * Grab the lowest frequency. compute_tsvector_stats() stored it for us
+        * in the one before the last cell of the Numbers array. See
+        * ts_typanalyze.c
+        */
+       minfreq = numbers[nnumbers - 2];
+ 
+       /* Construct the array for binary search */
+       lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+       for (i = 0; i < nmcelem; i++)
+       {
+               lookup[i].element = mcelem[i];
+               lookup[i].frequency = numbers[i];
+       }
+ 
+       selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                                                         nmcelem, minfreq);
+ 
+       pfree(lookup);
+ 
+       return selec;
+ }
+ 
+ /*
+  * TSQuery traversal function: estimates the selectivity of the subtree
+  * rooted at item.
+  *
+  * operand is the base address that operand offsets are relative to, lookup
+  * (with length entries) is the array searched for operand lexemes, and
+  * minfreq is the frequency used to cap the estimate for lexemes that are
+  * not found in lookup.
+  */
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq)
+ {
+       Selectivity     first,
+                               second;
+ 
+       /* since this function recurses, it could be driven to stack overflow */
+       check_stack_depth();
+ 
+       if (item->type == QI_VAL)
+       {
+               QueryOperand *val = (QueryOperand *) item;
+               LexemeKey       probe;
+               TextFreq        *hit;
+ 
+               /* Describe the operand's lexeme and look it up */
+               probe.lexeme = operand + val->distance;
+               probe.length = val->length;
+ 
+               hit = (TextFreq *) bsearch(&probe, lookup, length,
+                                                                  sizeof(TextFreq), compare_lexeme_textfreq);
+ 
+               /*
+                * Not in MCELEM: punt, but the selectivity cannot be more than
+                * minfreq / 2.
+                */
+               if (hit == NULL)
+                       return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+ 
+               /* In MCELEM: as precise as ANALYZE could find out */
+               return (Selectivity) hit->frequency;
+       }
+ 
+       /* Current TSQuery node is an operator; NOT has a single operand */
+       if (item->operator.oper == OP_NOT)
+               return 1.0 - tsquery_opr_selec(item + 1, operand, lookup,
+                                                                          length, minfreq);
+ 
+       /* Anything but AND / OR is unknown; do not descend into it */
+       if (item->operator.oper != OP_AND && item->operator.oper != OP_OR)
+       {
+               elog(ERROR, "unrecognized operator: %d", item->operator.oper);
+ 
+               /* never reached, keep compiler happy */
+               return (Selectivity) DEFAULT_TS_SEL;
+       }
+ 
+       /* Both binary operators need the estimates of their two operands */
+       first = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq);
+       second = tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                                          length, minfreq);
+ 
+       if (item->operator.oper == OP_AND)
+               return first * second;
+ 
+       return first + second - first * second;
+ }
+ 
+ /*
+  * bsearch() comparator for a lexeme (non-NUL-terminated string with length)
+  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
+  * how ANALYZE code sorted data before storing it in a statistic tuple.
+  * See ts_typanalyze.c for details.
+  *
+  * e1 points to the LexemeKey being searched for, e2 to the TextFreq it is
+  * compared with.  Returns a negative, zero or positive value as the key
+  * sorts before, equal to or after the TextFreq's element.
+  */
+ static int
+ compare_lexeme_textfreq(const void *e1, const void *e2)
+ {
+       const LexemeKey *key = (const LexemeKey *) e1;
+       const TextFreq  *t = (const TextFreq *) e2;
+       text                    *element = (text *) DatumGetPointer(t->element);
+       int                             len1,
+                                       len2;
+ 
+       /*
+        * The text Datum came from an array, so it cannot be compressed
+        * or stored out-of-line -- it's safe to use VARSIZE_ANY* and
+        * VARDATA_ANY on it directly.  No detoasting call is made here: there
+        * is nothing to detoast, and such a call could hand back a freshly
+        * allocated copy on every comparison.
+        */
+       Assert(!VARATT_IS_COMPRESSED(element) && !VARATT_IS_EXTERNAL(element));
+ 
+       len1 = key->length;
+       len2 = VARSIZE_ANY_EXHDR(element);
+ 
+       /* Compare lengths first, possibly avoiding a strncmp call */
+       if (len1 > len2)
+               return 1;
+       else if (len1 < len2)
+               return -1;
+ 
+       /* Fall back on byte-for-byte comparison */
+       return strncmp(key->lexeme, VARDATA_ANY(element), len1);
+ }
*** a/src/backend/tsearch/ts_typanalyze.c
--- b/src/backend/tsearch/ts_typanalyze.c
***************
*** 43,50 **** static void compute_tsvector_stats(VacAttrStats *stats,
  static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
  static uint32 lexeme_hash(const void *key, Size keysize);
  static int lexeme_match(const void *key1, const void *key2, Size keysize);
! static int trackitem_compare_desc(const void *e1, const void *e2);
! 
  
  /*
   *    ts_typanalyze -- a custom typanalyze function for tsvector columns
--- 43,51 ----
  static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
  static uint32 lexeme_hash(const void *key, Size keysize);
  static int lexeme_match(const void *key1, const void *key2, Size keysize);
! static int lexeme_compare(const void *key1, const void *key2);
! static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
! static int trackitem_compare_lexemes(const void *e1, const void *e2);
  
  /*
   *    ts_typanalyze -- a custom typanalyze function for tsvector columns
***************
*** 247,252 **** compute_tsvector_stats(VacAttrStats *stats,
--- 248,254 ----
                int                     i;
                TrackItem       **sort_table;
                int                     track_len;
+               int                     minfreq, maxfreq;
  
                stats->stats_valid = true;
                /* Do the simple null-frac and average width stats */
***************
*** 273,279 **** compute_tsvector_stats(VacAttrStats *stats,
                Assert(i == track_len);
  
                qsort(sort_table, track_len, sizeof(TrackItem *),
!                         trackitem_compare_desc);
  
                /* Suppress any single-occurrence items */
                while (track_len > 0)
--- 275,281 ----
                Assert(i == track_len);
  
                qsort(sort_table, track_len, sizeof(TrackItem *),
!                         trackitem_compare_frequencies_desc);
  
                /* Suppress any single-occurrence items */
                while (track_len > 0)
***************
*** 287,292 **** compute_tsvector_stats(VacAttrStats *stats,
--- 289,314 ----
                if (num_mcelem > track_len)
                        num_mcelem = track_len;
  
+               /* Grab the minimal and maximal frequencies that will get 
stored */
+               minfreq = sort_table[num_mcelem]->frequency;
+               maxfreq = sort_table[0]->frequency;
+ 
+               /*
+                * We want to store statistics sorted on the lexeme value using first
+                * length, then byte-for-byte comparison. The reason for doing length
+                * comparison first is that we don't care about the ordering as long
+                * as it's consistent and comparing lengths first gives us a chance to
+                * avoid a strncmp() call.
+                *
+                * This is different from what we do with scalar statistics -- they get
+                * sorted on frequencies. The rationale is that we usually search
+                * through most common elements looking for a specific value, so we can
+                * grab its frequency. When values are presorted we can employ binary
+                * search for that. See ts_selfuncs.c for a real usage scenario.
+                */
+               qsort(sort_table, num_mcelem, sizeof(TrackItem *),
+                         trackitem_compare_lexemes);
+ 
                /* Generate MCELEM slot entry */
                if (num_mcelem > 0)
                {
***************
*** 296,303 **** compute_tsvector_stats(VacAttrStats *stats,
  
                        /* Must copy the target values into anl_context */
                        old_context = MemoryContextSwitchTo(stats->anl_context);
                        mcelem_values = (Datum *) palloc(num_mcelem * 
sizeof(Datum));
!                       mcelem_freqs = (float4 *) palloc(num_mcelem * 
sizeof(float4));
  
                        for (i = 0; i < num_mcelem; i++)
                        {
--- 318,332 ----
  
                        /* Must copy the target values into anl_context */
                        old_context = MemoryContextSwitchTo(stats->anl_context);
+ 
+                       /*
+                        * We sorted statistics on the lexeme value, but we want to be
+                        * able to reach the minimal and maximal frequency without going
+                        * through all the values. We keep those two extra frequencies in
+                        * two extra cells in mcelem_freqs.
+                        */
                        mcelem_values = (Datum *) palloc(num_mcelem * 
sizeof(Datum));
!                       mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * 
sizeof(float4));
  
                        for (i = 0; i < num_mcelem; i++)
                        {
***************
*** 306,319 **** compute_tsvector_stats(VacAttrStats *stats,
                                mcelem_values[i] =
                                        
PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
                                                                                
                                         item->key.length));
!                               mcelem_freqs[i] = (double) item->frequency / 
(double) nonnull_cnt;
                        }
                        MemoryContextSwitchTo(old_context);
  
                        stats->stakind[0] = STATISTIC_KIND_MCELEM;
                        stats->staop[0] = TextEqualOperator;
                        stats->stanumbers[0] = mcelem_freqs;
!                       stats->numnumbers[0] = num_mcelem;
                        stats->stavalues[0] = mcelem_values;
                        stats->numvalues[0] = num_mcelem;
                        /* We are storing text values */
--- 335,353 ----
                                mcelem_values[i] =
                                        
PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
                                                                                
                                         item->key.length));
!                               mcelem_freqs[i] =
!                                       (double) item->frequency /
!                                       (double) nonnull_cnt;
                        }
+                       mcelem_freqs[i++] = (double) minfreq / (double) 
nonnull_cnt;
+                       mcelem_freqs[i] = (double) maxfreq / (double) 
nonnull_cnt;
                        MemoryContextSwitchTo(old_context);
  
                        stats->stakind[0] = STATISTIC_KIND_MCELEM;
                        stats->staop[0] = TextEqualOperator;
                        stats->stanumbers[0] = mcelem_freqs;
!                       /* See above comment about two extra frequency fields */
!                       stats->numnumbers[0] = num_mcelem + 2;
                        stats->stavalues[0] = mcelem_values;
                        stats->numvalues[0] = num_mcelem;
                        /* We are storing text values */
***************
*** 379,403 **** lexeme_hash(const void *key, Size keysize)
  static int
  lexeme_match(const void *key1, const void *key2, Size keysize)
  {
!       const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
!       const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
  
!       /* The lexemes need to have the same length, and be memcmp-equal */
!       if (d1->length == d2->length &&
!               memcmp(d1->lexeme, d2->lexeme, d1->length) == 0)
!               return 0;
!       else
                return 1;
  }
  
  /*
!  *    qsort() comparator for TrackItems - LC style (descending sort)
   */
  static int
! trackitem_compare_desc(const void *e1, const void *e2)
  {
        const TrackItem * const *t1 = (const TrackItem * const *) e1;
        const TrackItem * const *t2 = (const TrackItem * const *) e2;
  
        return (*t2)->frequency - (*t1)->frequency;
  }
--- 413,461 ----
  static int
  lexeme_match(const void *key1, const void *key2, Size keysize)
  {
!       /* The keysize parameter is superfluous, the keys store their lengths */
!       return lexeme_compare(key1, key2);
! }
  
! /*
!  *    Comparison function for lexemes.
!  *
!  *    Both arguments point to LexemeHashKeys.  A shorter lexeme sorts before
!  *    a longer one; lexemes of equal length are ordered by strncmp().
!  *    Returns a negative, zero or positive value accordingly.
!  */
! static int
! lexeme_compare(const void *key1, const void *key2)
! {
!       const LexemeHashKey     *d1 = (const LexemeHashKey *) key1;
!       const LexemeHashKey     *d2 = (const LexemeHashKey *) key2;
! 
!       /* First, compare by length */
!       if (d1->length > d2->length)
                return 1;
+       else if (d1->length < d2->length)
+               return -1;
+       else
+               /*
+                * Lengths are equal, do a byte-for-byte comparison.
+                * NOTE(review): strncmp() stops at a NUL byte; this presumes
+                * lexemes never contain one -- confirm.
+                */
+               return strncmp(d1->lexeme, d2->lexeme, d1->length);
  }
  
  /*
!  *    qsort() comparator for sorting TrackItems on frequencies (descending 
sort)
   */
  static int
! trackitem_compare_frequencies_desc(const void *e1, const void *e2)
  {
        const TrackItem * const *t1 = (const TrackItem * const *) e1;
        const TrackItem * const *t2 = (const TrackItem * const *) e2;
  
        return (*t2)->frequency - (*t1)->frequency;
  }
+ 
+ /*
+  *    qsort() comparator that orders TrackItem pointers by the lexeme keys
+  *    of the items they point to
+  */
+ static int
+ trackitem_compare_lexemes(const void *e1, const void *e2)
+ {
+       /* The sorted array holds pointers, so each argument is a TrackItem ** */
+       const TrackItem *left = *(const TrackItem * const *) e1;
+       const TrackItem *right = *(const TrackItem * const *) e2;
+ 
+       return lexeme_compare(&left->key, &right->key);
+ }
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 915,924 **** DATA(insert OID = 3630 (  "<>"    PGNSP PGUID b f f 3614       
 3614    16 3630 3629    ts
  DATA(insert OID = 3631 (  ">="           PGNSP PGUID b f f 3614        3614   
 16 3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"    PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"           PGNSP PGUID b f f 3614        3614   
 3614  0        0        tsvector_concat   -    -         ));
! DATA(insert OID = 3636 (  "@@"           PGNSP PGUID b f f 3614        3615   
 16 3637        0        ts_match_vq   contsel     contjoinsel   ));
! DATA(insert OID = 3637 (  "@@"           PGNSP PGUID b f f 3615        3614   
 16 3636        0        ts_match_qv   contsel     contjoinsel   ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614      3615    16 
3661        0        ts_match_vq   contsel     contjoinsel   ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615      3614    16 
3660        0        ts_match_qv   contsel     contjoinsel   ));
  DATA(insert OID = 3674 (  "<"    PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="           PGNSP PGUID b f f 3615        3615   
 16 3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="    PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
--- 915,924 ----
  DATA(insert OID = 3631 (  ">="           PGNSP PGUID b f f 3614        3614   
 16 3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"    PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"           PGNSP PGUID b f f 3614        3614   
 3614  0        0        tsvector_concat   -    -         ));
! DATA(insert OID = 3636 (  "@@"           PGNSP PGUID b f f 3614        3615   
 16 3637        0        ts_match_vq   tssel       contjoinsel   ));
! DATA(insert OID = 3637 (  "@@"           PGNSP PGUID b f f 3615        3614   
 16 3636        0        ts_match_qv   tssel       contjoinsel   ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614      3615    16 
3661        0        ts_match_vq   tssel       contjoinsel   ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615      3614    16 
3660        0        ts_match_qv   tssel       contjoinsel   ));
  DATA(insert OID = 3674 (  "<"    PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="           PGNSP PGUID b f f 3615        3615   
 16 3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="    PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4434,4439 **** DESCR("GiST tsquery support");
--- 4434,4441 ----
  DATA(insert OID = 3701 (  gtsquery_consistent                 PGNSP PGUID 12 
1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ 
gtsquery_consistent _null_ _null_ _null_ ));
  DESCR("GiST tsquery support");
  
+ DATA(insert OID = 3687 (  tssel       PGNSP PGUID 12 1 0 0 f f t f s 4 701 
"2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+ DESCR("restriction selectivity of tsvector @@ tsquery");
  DATA(insert OID = 3688 (  ts_typanalyze       PGNSP PGUID 12 1 0 0 f f t f s 
1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
  DESCR("tsvector typanalyze");
  
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
***************
*** 155,160 **** extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);
--- 155,162 ----
  
  extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
  
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  
  /*
   * TSQuery
***************
*** 291,294 **** extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS);
--- 293,309 ----
  extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
  extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
  
+ /*
+  * The default text search selectivity is chosen to be small enough to
+  * encourage indexscans for typical table densities. See selfuncs.h and
+  * DEFAULT_EQ_SEL for details.
+  */
+ #define DEFAULT_TS_SEL 0.005
+ 
+ /*
+  * operator restriction function for tsvector @@ tsquery and
+  * tsquery @@ tsvector
+  */
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  #endif   /* _PG_TSTYPE_H_ */

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] gsoc, oprrest function for text search take 2

Reply via email to