Hi!
On Mon, Jan 31, 2011 at 12:52 AM, Jan Urbański <[email protected]> wrote:
> I saw that the code tries to handle ILIKE searches, but apparently it's
> failing somewhere.
>
It was just a typo. Corrected version attached.
----
With best regards,
Alexander Korotkov.
*** a/contrib/pg_trgm/pg_trgm.sql.in
--- b/contrib/pg_trgm/pg_trgm.sql.in
***************
*** 113,118 **** FOR TYPE text USING gist
--- 113,120 ----
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
***************
*** 129,140 **** RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
--- 131,142 ----
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
***************
*** 144,151 **** CREATE OPERATOR CLASS gin_trgm_ops
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
! FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
STORAGE int4;
--- 146,155 ----
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
! FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
STORAGE int4;
*** a/contrib/pg_trgm/trgm.h
--- b/contrib/pg_trgm/trgm.h
***************
*** 13,24 ****
--- 13,32 ----
#define LPADDING 2
#define RPADDING 1
#define KEEPONLYALNUM
+ /*
+ * The IGNORECASE macro makes trigrams case-insensitive. If this macro is
+ * disabled, the ~~* operator must be excluded from the operator classes,
+ * because case-insensitive wildcard search cannot be answered with
+ * case-sensitive trigrams.
+ */
#define IGNORECASE
#define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
+ #define LikeStrategyNumber 3
+ #define ILikeStrategyNumber 4
typedef char trgm[3];
***************
*** 53,59 **** typedef struct
/* gist */
#define BITBYTE 8
! #define SIGLENINT 3 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
--- 61,67 ----
/* gist */
#define BITBYTE 8
! #define SIGLENINT 15 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
***************
*** 89,94 **** typedef char *BITVECP;
--- 97,107 ----
extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
+ TRGM *generate_wildcard_trgm(char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
+ bool trgm_contain(TRGM *trg1, TRGM *trg2);
+
+ #define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
+ #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */
#endif /* __TRGM_H__ */
*** a/contrib/pg_trgm/trgm_gin.c
--- b/contrib/pg_trgm/trgm_gin.c
***************
*** 6,11 ****
--- 6,12 ----
#include "trgm.h"
#include "access/gin.h"
+ #include "access/skey.h"
#include "access/itup.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
***************
*** 16,21 ****
--- 17,25 ----
PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
+ Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
+
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
***************
*** 58,90 **** gin_extract_trgm(PG_FUNCTION_ARGS)
}
Datum
gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! /* StrategyNumber strategy = PG_GETARG_UINT16(1); */
/* text *query = PG_GETARG_TEXT_P(2); */
! int32 nkeys = PG_GETARG_INT32(3);
! /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
int32 i,
! ntrue = 0;
/* All cases served by this function are inexact */
*recheck = true;
! /* Count the matches */
! for (i = 0; i < nkeys; i++)
{
! if (check[i])
! ntrue++;
! }
#ifdef DIVUNION
! res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
#endif
!
PG_RETURN_BOOL(res);
}
--- 62,182 ----
}
+ /*
+  * gin_extract_query_trgm - GIN extractQuery support function.
+  *
+  * Turns the query text (argument 0) into an array of int4 trigram keys.
+  * For the similarity strategy every trigram of the query is extracted; for
+  * the LIKE/ILIKE strategies only the trigrams that every string matching
+  * the wildcard pattern must contain are extracted.
+  *
+  * Outputs: *nentries (argument 1) receives the number of keys, *extra_data
+  * (argument 4) receives storage whose leading int32 holds that same count,
+  * and *searchMode (argument 6) is switched to GIN_SEARCH_MODE_ALL when no
+  * trigram could be extracted.  Returns the palloc'd key array, or NULL when
+  * there are no keys.
+  */
Datum
+ gin_extract_query_trgm(PG_FUNCTION_ARGS)
+ {
+ 	text	   *val = (text *) PG_GETARG_TEXT_P(0);
+ 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
+ 	StrategyNumber strategy = PG_GETARG_UINT16(2);
+ 	Datum	   *entries = NULL;
+ 	TRGM	   *trg;
+ 	int4		trglen;
+ 	int32	  **extra_data = (int32 **) PG_GETARG_POINTER(4);
+ 	int32	   *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ 	trgm	   *ptr;
+ 	int4		i = 0,
+ 				item;
+
+ 	switch (strategy)
+ 	{
+ 		case SimilarityStrategyNumber:
+ 			trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+ 			break;
+ 		case LikeStrategyNumber:
+ 		case ILikeStrategyNumber:
+ 			/*
+ 			 * For wildcard search we should extract all the trigrams which
+ 			 * every string conforming to the wildcard must include.
+ 			 */
+ 			trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+ 			break;
+ 		default:
+ 			elog(ERROR, "unrecognized strategy number: %d", strategy);
+ 			trg = NULL;		/* keep compiler quiet */
+ 			break;
+ 	}
+ 	trglen = ARRNELEM(trg);
+
+ 	*nentries = (int32) trglen;
+
+ 	if (trglen > 0)
+ 	{
+ 		/* Convert every extracted trigram into an int4 index key. */
+ 		entries = (Datum *) palloc(sizeof(Datum) * trglen);
+ 		ptr = GETARR(trg);
+ 		while (ptr - GETARR(trg) < ARRNELEM(trg))
+ 		{
+ 			item = trgm2int(ptr);
+ 			entries[i++] = Int32GetDatum(item);
+
+ 			ptr++;
+ 		}
+ 	}
+
+ 	/*
+ 	 * GIN's extractQuery contract requires *extra_data to be an array with
+ 	 * one Pointer slot per returned key, and GIN reads one slot for each key.
+ 	 * Allocate the whole array (at least one slot, so the count always has
+ 	 * somewhere to live) instead of a lone int32 that would be read past its
+ 	 * end.  The leading int32 of the array still carries the trigram count,
+ 	 * which is what gin_trgm_consistent reads back.
+ 	 */
+ 	*extra_data = (int32 *) palloc0(sizeof(Pointer) * (trglen > 0 ? trglen : 1));
+ 	**extra_data = trglen;
+
+ 	/*
+ 	 * If no trigrams were extracted then we have to scan the whole index.
+ 	 */
+ 	if (trglen == 0)
+ 		*searchMode = GIN_SEARCH_MODE_ALL;
+
+ 	PG_RETURN_POINTER(entries);
+ }
+
+ /*
+ * gin_trgm_consistent - GIN consistent support function.
+ *
+ * check[] (argument 0) tells which of the query's trigram keys are present
+ * in the indexed value; the number of keys is read from the leading int32
+ * of extra_data (argument 4), as stored by gin_extract_query_trgm.
+ * Similarity search compares the share of matched keys with trgm_limit;
+ * LIKE/ILIKE search requires every key to be present. *recheck is always
+ * set, so the result is only a candidate match.
+ */
+ Datum
gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! StrategyNumber strategy = PG_GETARG_UINT16(1);
/* text *query = PG_GETARG_TEXT_P(2); */
! /* int32 nkeys = PG_GETARG_INT32(3); */
! int32 *extra_data = (int32 *) PG_GETARG_POINTER(4);
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
int32 i,
! ntrue = 0,
! trglen;
+ #ifndef IGNORECASE
+ /* ILIKE cannot be answered from case-sensitive trigrams, so refuse it. */
+ if (strategy == ILikeStrategyNumber)
+ {
+ elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
+ }
+ #endif
/* All cases served by this function are inexact */
*recheck = true;
! /* Number of query trigrams, as recorded by gin_extract_query_trgm. */
! trglen = *extra_data;
!
! switch (strategy)
{
! case SimilarityStrategyNumber:
! /* Count the matches */
! for (i = 0; i < trglen; i++)
! if (check[i])
! ntrue++;
#ifdef DIVUNION
! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
#endif
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! /* Check whether all extracted trigrams are present. */
! res = true;
! for (i = 0; i < trglen; i++)
! if (!check[i])
! {
! res = false;
! break;
! }
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! res = false; /* keep compiler quiet */
! break;
! }
PG_RETURN_BOOL(res);
}
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra;
!
! /* All cases served by this function are exact */
! *recheck = false;
!
! if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
{
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
! memcpy(cache, query, VARSIZE(query));
! memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
--- 195,254 ----
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra,
! *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! #ifndef IGNORECASE
! /* ILIKE cannot be answered from case-sensitive trigrams, so refuse it. */
! if (strategy == ILikeStrategyNumber)
{
! elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
! }
! #endif
!
! /*
! * Cache both the strategy number and the extracted trigrams, because
! * trigram extraction is relatively CPU-expensive. The strategy number
! * must be cached too, because trigram extraction depends on the strategy.
! */
! if (cache == NULL || strategy != *((StrategyNumber *)cache) ||
! VARSIZE(cacheContents) != VARSIZE(query) ||
! memcmp(cacheContents, query, VARSIZE(query)) != 0)
! {
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! qtrg = NULL; /* keep compiler quiet */
! break;
! }
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
+ cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! memcpy(cache, &strategy, sizeof(StrategyNumber));
! memcpy(cacheContents, query, VARSIZE(query));
! memcpy(cacheContents + MAXALIGN(VARSIZE(query)),
! qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
+ /* Similarity search is exact. */
+ *recheck = false;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
***************
*** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS)
--- 271,308 ----
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
}
break;
+ case LikeStrategyNumber:
+ case ILikeStrategyNumber:
+ /*
+ * Wildcard search is inexact. It checks whether all of the extracted
+ * trigrams can be present in child nodes.
+ */
+ *recheck = true;
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ res = trgm_contain(qtrg, key);
+ }
+ else if (ISALLTRUE(key))
+ { /* non-leaf contains signature */
+ res = true;
+ }
+ else
+ { /* non-leaf contains signature */
+ int4 k, tmp = 0, len = ARRNELEM(qtrg);
+ trgm *ptr = GETARR(qtrg);
+ BITVECP sign = GETSIGN(key);
+ res = true;
+ for (k = 0; k < len; k++)
+ {
+ CPTRGM(((char *) &tmp), ptr + k);
+ if (!GETBIT(sign, HASHVAL(tmp)))
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
*** a/contrib/pg_trgm/trgm_op.c
--- b/contrib/pg_trgm/trgm_op.c
***************
*** 236,241 **** generate_trgm(char *str, int slen)
--- 236,443 ----
return trg;
}
+ /*
+  * Extracts the next part of a wildcard pattern into buf: a run of word
+  * characters bounded by '_' and '%' meta-characters, non-word characters or
+  * the string boundaries.  Escape characters are stripped while copying.
+  *
+  * str/lenstr: the not-yet-scanned pattern text and its length in bytes.
+  * buf: output buffer for the part, padding included.
+  * bytelen/charlen: receive the byte and character length of the part.
+  *
+  * Returns a pointer to where the next call should resume scanning, or NULL
+  * when str holds no further word character.  Padding spaces are added on a
+  * side only when the part is bounded there by a non-word character or a
+  * string boundary, never when it is bounded by a wildcard meta-character.
+  */
+ static char *
+ get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen)
+ {
+ 	char	   *beginword = str,
+ 			   *endword,
+ 			   *s = buf;
+ 	bool		in_leading_wildcard_meta = false,
+ 				in_trailing_wildcard_meta = false,
+ 				in_escape = false;
+ 	int			clen;
+
+ 	/*
+ 	 * Find the first word character, remembering whether the preceding
+ 	 * character was a wildcard meta-character.  When we stop at an escaped
+ 	 * word character the wildcard state is left untouched: the escape itself
+ 	 * matches nothing, so in "%\a" the 'a' is still preceded by '%' and must
+ 	 * not get left padding.  in_escape also stays set, so that the copy loop
+ 	 * below treats that character as escaped.
+ 	 */
+ 	while (beginword - str < lenstr)
+ 	{
+ 		if (in_escape)
+ 		{
+ 			if (iswordchr(beginword))
+ 				break;
+ 			in_escape = false;
+ 			in_leading_wildcard_meta = false;
+ 		}
+ 		else
+ 		{
+ 			if (ISESCAPECHAR(beginword))
+ 				in_escape = true;
+ 			else if (ISWILDCARDCHAR(beginword))
+ 				in_leading_wildcard_meta = true;
+ 			else if (iswordchr(beginword))
+ 				break;
+ 			else
+ 				in_leading_wildcard_meta = false;
+ 		}
+ 		beginword += pg_mblen(beginword);
+ 	}
+
+ 	/*
+ 	 * Add left padding spaces if the preceding character wasn't a wildcard
+ 	 * meta-character.
+ 	 */
+ 	*charlen = 0;
+ 	if (!in_leading_wildcard_meta)
+ 	{
+ 		if (LPADDING > 0)
+ 		{
+ 			*s++ = ' ';
+ 			(*charlen)++;
+ 			if (LPADDING > 1)
+ 			{
+ 				*s++ = ' ';
+ 				(*charlen)++;
+ 			}
+ 		}
+ 	}
+
+ 	/*
+ 	 * Handle string end.
+ 	 */
+ 	if (beginword - str >= lenstr)
+ 		return NULL;
+
+ 	/*
+ 	 * Copy the part into buf until a wildcard meta-character, a non-word
+ 	 * character or the string boundary.  Escapes are stripped during the
+ 	 * copy.  in_escape is carried over from the loop above on purpose.
+ 	 */
+ 	endword = beginword;
+ 	while (endword - str < lenstr)
+ 	{
+ 		clen = pg_mblen(endword);
+ 		if (in_escape)
+ 		{
+ 			if (!iswordchr(endword))
+ 			{
+ 				/*
+ 				 * Stop at an escaped non-word character, backing up to the
+ 				 * escape character so that the next call re-reads the pair
+ 				 * as escaped instead of mistaking, say, an escaped '%' for a
+ 				 * wildcard or an escaped backslash for a new escape.  This
+ 				 * relies on the escape character being a single byte.
+ 				 */
+ 				endword--;
+ 				break;
+ 			}
+ 			(*charlen)++;
+ 			memcpy(s, endword, clen);
+ 			s += clen;
+ 			in_escape = false;
+ 		}
+ 		else
+ 		{
+ 			if (ISESCAPECHAR(endword))
+ 				in_escape = true;
+ 			else if (ISWILDCARDCHAR(endword))
+ 			{
+ 				in_trailing_wildcard_meta = true;
+ 				break;
+ 			}
+ 			else if (iswordchr(endword))
+ 			{
+ 				(*charlen)++;
+ 				memcpy(s, endword, clen);
+ 				s += clen;
+ 			}
+ 			else
+ 				break;
+ 		}
+ 		endword += clen;
+ 	}
+
+ 	/*
+ 	 * Add right padding spaces if the part wasn't ended by a wildcard
+ 	 * meta-character.
+ 	 */
+ 	if (!in_trailing_wildcard_meta)
+ 	{
+ 		if (RPADDING > 0)
+ 		{
+ 			*s++ = ' ';
+ 			(*charlen)++;
+ 			if (RPADDING > 1)
+ 			{
+ 				*s++ = ' ';
+ 				(*charlen)++;
+ 			}
+ 		}
+ 	}
+ 	*bytelen = s - buf;
+ 	return endword;
+ }
+
+ /*
+  * Builds the trigram array for a wildcard pattern: the trigrams that must
+  * occur in any string conforming to the wildcard.  For example, the pattern
+  * "a%bcd%" yields the trigrams "  a" and "bcd".
+  *
+  * str/slen: pattern text and its length in bytes.
+  * Returns a palloc'd TRGM whose array is sorted and free of duplicates; it
+  * is empty when the pattern yields no trigram.
+  */
+ TRGM *
+ generate_wildcard_trgm(char *str, int slen)
+ {
+ 	TRGM	   *result;
+ 	trgm	   *out;
+ 	char	   *part;
+ 	char	   *cursor = str;
+ 	int			ntrgm;
+ 	int			part_bytes;
+ 	int			part_chars;
+
+ 	result = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
+ 	result->flag = ARRKEY;
+ 	SET_VARSIZE(result, TRGMHDRSIZE);
+
+ 	/* Too short to hold even one trigram: hand back the empty array. */
+ 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ 		return result;
+
+ 	out = GETARR(result);
+ 	part = palloc(sizeof(char) * (slen + 4));
+
+ 	/*
+ 	 * Walk the pattern one wildcard part at a time, appending the trigrams
+ 	 * of each part to the output array.
+ 	 */
+ 	for (;;)
+ 	{
+ 		char	   *folded;
+
+ 		cursor = get_wildcard_part(cursor, slen - (cursor - str),
+ 								   part, &part_bytes, &part_chars);
+ 		if (cursor == NULL)
+ 			break;
+
+ #ifdef IGNORECASE
+ 		/* Case-insensitive trigrams are built from the lower-cased part. */
+ 		folded = lowerstr_with_len(part, part_bytes);
+ 		part_bytes = strlen(folded);
+ #else
+ 		folded = part;
+ #endif
+
+ 		out = make_trigrams(out, folded, part_bytes, part_chars);
+
+ #ifdef IGNORECASE
+ 		pfree(folded);
+ #endif
+ 	}
+
+ 	pfree(part);
+
+ 	ntrgm = out - GETARR(result);
+ 	if (ntrgm == 0)
+ 		return result;
+
+ 	/* Sort the trigrams and drop duplicates, then record the final size. */
+ 	qsort((void *) GETARR(result), ntrgm, sizeof(trgm), comp_trgm);
+ 	ntrgm = unique_array(GETARR(result), ntrgm);
+
+ 	SET_VARSIZE(result, CALCGTSIZE(ARRKEY, ntrgm));
+
+ 	return result;
+ }
+
uint32
trgm2int(trgm *ptr)
{
***************
*** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2)
--- 542,586 ----
}
+ /*
+  * Returns whether trg2 contains all trigrams from trg1.
+  *
+  * Both arrays are walked in a single merge-style pass, which presumably
+  * relies on both being sorted in CMPTRGM order -- TODO confirm this holds
+  * for every caller.
+  */
+ bool
+ trgm_contain(TRGM *trg1, TRGM *trg2)
+ {
+ 	trgm	   *ptr1 = GETARR(trg1),
+ 			   *ptr2 = GETARR(trg2);
+ 	int			len1 = ARRNELEM(trg1),
+ 				len2 = ARRNELEM(trg2);
+
+ 	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+ 	{
+ 		int			res = CMPTRGM(ptr1, ptr2);
+
+ 		if (res < 0)
+ 			return false;	/* *ptr1 precedes everything left in trg2 */
+ 		else if (res > 0)
+ 			ptr2++;
+ 		else
+ 		{
+ 			ptr1++;
+ 			ptr2++;
+ 		}
+ 	}
+
+ 	/* Any trigram of trg1 left unmatched is missing from trg2. */
+ 	return !(ptr1 - GETARR(trg1) < len1);
+ }
+
PG_FUNCTION_INFO_V1(similarity);
Datum similarity(PG_FUNCTION_ARGS);
Datum
*** a/contrib/pg_trgm/uninstall_pg_trgm.sql
--- b/contrib/pg_trgm/uninstall_pg_trgm.sql
***************
*** 27,35 **** DROP OPERATOR CLASS gin_trgm_ops USING gin;
DROP FUNCTION gin_extract_trgm(text, internal);
! DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal);
! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal);
DROP OPERATOR % (text, text);
--- 27,35 ----
DROP FUNCTION gin_extract_trgm(text, internal);
! DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal);
! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal);
DROP OPERATOR % (text, text);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers