Hi all,

Here is my preliminary work on porting pg_trgm to GIN. pg_trgm can be
a very good addition to tsearch2 to suggest spellings for mispelled
words as explained in the README.pg_trgm file and I'd like to use it
in this case. GIST implementation is a bit slow so I tried to port it
to use GIN.

The attached patch is the first working implementation. It's not final
but I would like some feedback on how to fix the remaining problems.

From a previous discussion with Teodor, it would be better to store an
int in the index instead of a text (it takes less space and is
faster). I couldn't find any example so if anyone has an advice to fix
that, it's welcome (mostly how to pack the trigram into an int instead
of a text).

The last problem is that similarity calculated in the GIN index is
higher than the one with GIST so I have to set the trgm_limit quite
high to have decent results (a limit of 0.8 instead of 0.3 seems to be
quite good).
AFAICS, it comes from the fact that I couldn't find any way to get the
length of the indexed trigram which is taken into account with GIST so
we're not exactly filtering the results in the same way.
Does anyone have an idea on how to fix this point?

Thanks for your attention.

--
Guillaume
Index: Makefile
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/pg_trgm/Makefile,v
retrieving revision 1.6
diff -c -r1.6 Makefile
*** Makefile    9 Feb 2007 17:24:33 -0000       1.6
--- Makefile    21 Feb 2007 23:49:37 -0000
***************
*** 1,7 ****
  # $PostgreSQL: pgsql/contrib/pg_trgm/Makefile,v 1.6 2007/02/09 17:24:33 
petere Exp $
  
  MODULE_big = pg_trgm
! OBJS = trgm_op.o trgm_gist.o 
  
  DATA_built = pg_trgm.sql
  DATA = uninstall_pg_trgm.sql
--- 1,7 ----
  # $PostgreSQL: pgsql/contrib/pg_trgm/Makefile,v 1.6 2007/02/09 17:24:33 
petere Exp $
  
  MODULE_big = pg_trgm
! OBJS = trgm_op.o trgm_gist.o trgm_gin.o
  
  DATA_built = pg_trgm.sql
  DATA = uninstall_pg_trgm.sql
Index: uninstall_pg_trgm.sql
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/pg_trgm/uninstall_pg_trgm.sql,v
retrieving revision 1.2
diff -c -r1.2 uninstall_pg_trgm.sql
*** uninstall_pg_trgm.sql       13 Mar 2006 18:04:58 -0000      1.2
--- uninstall_pg_trgm.sql       21 Feb 2007 23:49:37 -0000
***************
*** 20,25 ****
--- 20,33 ----
  
  DROP TYPE gtrgm CASCADE;
  
+ DROP OPERATOR CLASS gin_trgm_ops USING gin;
+ 
+ DROP FUNCTION gin_extract_trgm(text, internal);
+ 
+ DROP FUNCTION gin_extract_trgm(text, internal, internal);
+ 
+ DROP FUNCTION gin_trgm_consistent(internal, internal, text);
+ 
  DROP OPERATOR % (text, text);
  
  DROP FUNCTION similarity_op(text,text);
Index: pg_trgm.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/pg_trgm/pg_trgm.sql.in,v
retrieving revision 1.2
diff -c -r1.2 pg_trgm.sql.in
*** pg_trgm.sql.in      27 Feb 2006 16:09:48 -0000      1.2
--- pg_trgm.sql.in      21 Feb 2007 23:49:37 -0000
***************
*** 36,42 ****
          JOIN = contjoinsel
  );
  
! --gist key
  CREATE FUNCTION gtrgm_in(cstring)
  RETURNS gtrgm
  AS 'MODULE_PATHNAME'
--- 36,42 ----
          JOIN = contjoinsel
  );
  
! -- gist key
  CREATE FUNCTION gtrgm_in(cstring)
  RETURNS gtrgm
  AS 'MODULE_PATHNAME'
***************
*** 53,59 ****
          OUTPUT = gtrgm_out
  );
  
! -- support functions
  CREATE FUNCTION gtrgm_consistent(gtrgm,internal,int4)
  RETURNS bool
  AS 'MODULE_PATHNAME'
--- 53,59 ----
          OUTPUT = gtrgm_out
  );
  
! -- support functions for gist
  CREATE FUNCTION gtrgm_consistent(gtrgm,internal,int4)
  RETURNS bool
  AS 'MODULE_PATHNAME'
***************
*** 89,95 ****
  AS 'MODULE_PATHNAME'
  LANGUAGE C;
  
! -- create the operator class
  CREATE OPERATOR CLASS gist_trgm_ops
  FOR TYPE text USING gist
  AS
--- 89,95 ----
  AS 'MODULE_PATHNAME'
  LANGUAGE C;
  
! -- create the operator class for gist
  CREATE OPERATOR CLASS gist_trgm_ops
  FOR TYPE text USING gist
  AS
***************
*** 103,107 ****
--- 103,133 ----
          FUNCTION        7       gtrgm_same (gtrgm, gtrgm, internal),
          STORAGE         gtrgm;
  
+ -- support functions for gin
+ CREATE FUNCTION gin_extract_trgm(text, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+ 
+ CREATE FUNCTION gin_extract_trgm(text, internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+ 
+ CREATE FUNCTION gin_trgm_consistent(internal, internal, text)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+ 
+ -- create the operator class for gin
+ CREATE OPERATOR CLASS gin_trgm_ops
+ FOR TYPE text USING gin
+ AS
+         OPERATOR        1       % (text, text),
+         FUNCTION        1       bttextcmp (text, text),
+         FUNCTION        2       gin_extract_trgm (text, internal),
+         FUNCTION        3       gin_extract_trgm (text, internal, internal),
+         FUNCTION        4       gin_trgm_consistent (internal, internal, 
text),
+         STORAGE         text;
  
  COMMIT;
Index: contrib/pg_trgm/trgm_gin.c
===================================================================
RCS file: contrib/pg_trgm/trgm_gin.c
diff -N contrib/pg_trgm/trgm_gin.c
*** /dev/null   1 Jan 1970 00:00:00 -0000
--- contrib/pg_trgm/trgm_gin.c  1 Jan 1970 00:00:00 -0000
***************
*** 0 ****
--- 1,78 ----
+ #include "trgm.h"
+ 
+ #include "access/gin.h"
+ #include "access/itup.h"
+ #include "access/tuptoaster.h"
+ #include "storage/bufpage.h"
+ #include "utils/array.h"
+ #include "utils/builtins.h"
+ 
+ PG_FUNCTION_INFO_V1(gin_extract_trgm);
+ Datum         gin_extract_trgm(PG_FUNCTION_ARGS);
+ 
+ PG_FUNCTION_INFO_V1(gin_trgm_consistent);
+ Datum         gin_trgm_consistent(PG_FUNCTION_ARGS);
+ 
+ Datum
+ gin_extract_trgm(PG_FUNCTION_ARGS)
+ {
+       text            *val = (text *) PG_GETARG_TEXT_P(0);
+       int32           *nentries = (int32 *) PG_GETARG_POINTER(1);
+       Datum           *entries = NULL;
+       TRGM            *trg;
+       int4            trglen;
+       
+       *nentries = 0;
+       
+       trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+       trglen = ARRNELEM(trg);
+       
+       if (trglen > 0)
+       {
+               trgm *ptr;
+               int4 i = 0;
+               *nentries = (int32) trglen;
+               entries = (Datum *) palloc(sizeof(Datum) * trglen);
+ 
+               ptr = GETARR(trg);
+               while (ptr - GETARR(trg) < ARRNELEM(trg))
+               {
+                       text *item = (text *) palloc(VARHDRSZ + 3);
+       
+                       VARATT_SIZEP(item) = VARHDRSZ + 3;
+                       CPTRGM(VARDATA(item), ptr);
+                       entries[i++] = PointerGetDatum(item);
+                       
+                       ptr++;
+               }
+       }
+ 
+       PG_RETURN_POINTER(entries);
+ }
+ 
+ Datum
+ gin_trgm_consistent(PG_FUNCTION_ARGS)
+ {
+       bool            *check = (bool *) PG_GETARG_POINTER(0);
+       text            *query = (text *) PG_GETARG_TEXT_P(2);
+       bool            res = FALSE;
+       TRGM            *trg;
+       int                     i,
+                               trglen,
+                               ntrue = 0;
+       
+       trg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+       trglen = ARRNELEM(trg);
+       
+       for (i = 0; i < trglen; i++)
+               if (check[i])
+                       ntrue ++;
+ 
+ #ifdef DIVUNION
+       res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) 
(trglen - ntrue)))) >= trgm_limit) ? true : false);
+ #else
+       res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) 
trglen))) >= trgm_limit) ? true : false);
+ #endif
+ 
+       PG_RETURN_BOOL(res);
+ }
---------------------------(end of broadcast)---------------------------
TIP 4: Have you searched our list archives?

               http://archives.postgresql.org

Reply via email to