Jan Urbański wrote:
> Great, I didn't know the API was that convenient in 8.3. I'll try
> posting a working patch for 8.3 during the weekend.

Here's the patch for 8.3beta2. As was suggested, I added a configuration
parameter to the 'simple' dictionary called AcceptAll, so now it can work
in two modes: either accept everything (the default) or do not
recognize anything (return NULL). Of course, stopwords are still being
weeded out.

The patch includes changes to the documentation (which was inconsistent
by the way: it stated that the 'simple' dictionary returns NULL for
stopwords, when in fact it returns an empty array).

Regards,
Jan Urbanski
-- 
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff -Naur postgresql-8.3beta2-orig/doc/src/sgml/textsearch.sgml 
postgresql-8.3beta2/doc/src/sgml/textsearch.sgml
--- postgresql-8.3beta2-orig/doc/src/sgml/textsearch.sgml       2007-10-27 
02:19:45.000000000 +0200
+++ postgresql-8.3beta2/doc/src/sgml/textsearch.sgml    2007-11-14 
03:35:48.000000000 +0100
@@ -2090,9 +2090,10 @@
    <para>
     The <literal>simple</> dictionary template operates by converting the
     input token to lower case and checking it against a file of stop words.
-    If it is found in the file then <literal>NULL</> is returned, causing
-    the token to be discarded.  If not, the lower-cased form of the word
-    is returned as the normalized lexeme.
+    If it is found in the file then an empty array is returned. If not, the
+    return value depends on the configuration. The default is to return the
+    lower-cased form of the word, but one might choose to
+    return <literal>NULL</> instead.
    </para>
 
    <para>
@@ -2135,6 +2136,34 @@
 </programlisting>
    </para>
 
+   <para>
+     We can also choose to return <literal>NULL</> instead of the lower-cased
+     lexeme if it is not found in the stop words file. This can be useful if
+     we just want to pass the unchanged lexeme to another dictionary instead
+     of reporting it as recognized. We can control this behaviour through
+     the <literal>AcceptAll</> parameter. Correct values for this parameter
+     are <literal>true</> and <literal>false</>, the default
+     is <literal>true</>.
+   </para>
+
+   <para>
+     Using the same configuration as in the previous example:
+
+<programlisting>
+ALTER TEXT SEARCH DICTIONARY public.simple_dict ( AcceptAll = false );
+
+SELECT ts_lexize('public.simple_dict','YeS');
+ ts_lexize
+-----------
+
+
+SELECT ts_lexize('public.simple_dict','The');
+ ts_lexize
+-----------
+ {}
+</programlisting>
+   </para>
+
    <caution>
     <para>
      Most types of dictionaries rely on configuration files, such as files of
diff -Naur postgresql-8.3beta2-orig/src/backend/tsearch/dict_simple.c 
postgresql-8.3beta2/src/backend/tsearch/dict_simple.c
--- postgresql-8.3beta2-orig/src/backend/tsearch/dict_simple.c  2007-08-25 
02:03:59.000000000 +0200
+++ postgresql-8.3beta2/src/backend/tsearch/dict_simple.c       2007-11-14 
03:39:45.000000000 +0100
@@ -23,6 +23,7 @@
 typedef struct
 {
        StopList        stoplist;
+       bool            acceptAll;
 } DictSimple;
 
 
@@ -31,8 +32,12 @@
 {
        List       *dictoptions = (List *) PG_GETARG_POINTER(0);
        DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
-       bool            stoploaded = false;
+       bool            stoploaded = false,
+                       acceptloaded = false;
        ListCell   *l;
+       const char      *defstring;
+
+       d->acceptAll = true;
 
        foreach(l, dictoptions)
        {
@@ -47,6 +52,24 @@
                        readstoplist(defGetString(defel), &d->stoplist, 
lowerstr);
                        stoploaded = true;
                }
+               else if (pg_strcasecmp("AcceptAll", defel->defname) == 0)
+               {
+                       if (acceptloaded)
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("multiple AcceptAll 
parameters")));
+                       defstring = defGetString(defel);
+                       if (pg_strcasecmp(defstring, "True") == 0)
+                               d->acceptAll = true;
+                       else if (pg_strcasecmp(defstring, "False") == 0)
+                               d->acceptAll = false;
+                       else
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("invalid value for AcceptAll 
parameter: \"%s\"",
+                                                       defstring)));
+                       acceptloaded = true;
+               }
                else
                {
                        ereport(ERROR,
@@ -71,9 +94,18 @@
        txt = lowerstr_with_len(in, len);
 
        if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+       {
                pfree(txt);
+               PG_RETURN_POINTER(res);
+       }
        else
-               res[0].lexeme = txt;
-
-       PG_RETURN_POINTER(res);
+       {
+               if (d->acceptAll)
+               {
+                       res[0].lexeme = txt;
+                       PG_RETURN_POINTER(res);
+               }
+               else
+                       PG_RETURN_POINTER(NULL);
+       }
 }

Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to