> I've rebased the patches on latest master.
Attached is v6 of the patch which fixes a problem with the tests:
pg_trgm_collation_1.out mistakenly contained \endif at the end of the file.
--
David Geier
From 916e50174bf2df7fbe138803fbbd8f9b435dd2d8 Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Fri, 23 Jan 2026 15:39:06 +0100
Subject: [PATCH v6 2/2] Use correct collation for finding word boundaries
pg_trgm finds all words in the input string and creates trigrams for
them. Word characters are alphanumeric characters. What qualifies as
an alphanumeric character depends on the collation. Previously,
pg_trgm always used the default collation. Now the specified collation
is used instead.
---
.../pg_trgm/expected/pg_trgm_collation.out | 13 ++++++++++++
contrib/pg_trgm/sql/pg_trgm_collation.sql | 5 +++++
contrib/pg_trgm/trgm.h | 6 +++---
contrib/pg_trgm/trgm_op.c | 20 +++++++++----------
contrib/pg_trgm/trgm_regexp.c | 2 +-
src/backend/tsearch/ts_locale.c | 8 ++++----
src/include/tsearch/ts_locale.h | 3 ++-
7 files changed, 38 insertions(+), 19 deletions(-)
diff --git a/contrib/pg_trgm/expected/pg_trgm_collation.out b/contrib/pg_trgm/expected/pg_trgm_collation.out
index 472ce867665..0cc53edd821 100644
--- a/contrib/pg_trgm/expected/pg_trgm_collation.out
+++ b/contrib/pg_trgm/expected/pg_trgm_collation.out
@@ -41,3 +41,16 @@ SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
0.54545456
(1 row)
+-- Test that word boundary identification uses specified collation
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "tr-x-icu");
+ show_trgm
+-------------------------------------------------------------------------------------------------------------------------------
+ {0x8fc0a2,0x93dfbf,0x1bf43c," h"," he",0x22d44f,0x4398ff,cod,"de ",dic,ell,est,hel,ico,ldi,llo,ode,orl,0x71b8f5,rld,tes,wor}
+(1 row)
+
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "C");
+ show_trgm
+-------------------------------------------------------------------------------------------------------------
+ {" c"," h"," t"," co"," he"," te",cod,"de ",ell,est,hel,iwo,"ld ",llo,"lo ",ode,orl,rld,sti,tes,tiw,wor}
+(1 row)
+
diff --git a/contrib/pg_trgm/sql/pg_trgm_collation.sql b/contrib/pg_trgm/sql/pg_trgm_collation.sql
index afb3973a8b3..e1a5c7c5fa8 100644
--- a/contrib/pg_trgm/sql/pg_trgm_collation.sql
+++ b/contrib/pg_trgm/sql/pg_trgm_collation.sql
@@ -22,3 +22,8 @@ SELECT show_trgm('ISTANBUL' COLLATE "C");
SELECT similarity('ıstanbul' COLLATE "tr-x-icu", 'ISTANBUL' COLLATE "tr-x-icu");
SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
+-- Test that word boundary identification uses specified collation
+
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "tr-x-icu");
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "C");
+
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index b6911e91458..3c4db129e20 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -47,9 +47,9 @@ typedef char trgm[3];
} while(0)
extern int (*CMPTRGM) (const void *a, const void *b);
-#define ISWORDCHR(c, len) (t_isalnum_with_len(c, len))
-#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
-#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
+#define ISWORDCHR(c, len, collation) (t_isalnum_with_len_collation(c, len, collation))
+#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
+#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 5f2defb57f2..ca30cb7c363 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -292,7 +292,7 @@ comp_trgm(const void *a, const void *b)
* endword points to the character after word
*/
static char *
-find_word(char *str, int lenstr, char **endword)
+find_word(char *str, int lenstr, char **endword, Oid collation)
{
char *beginword = str;
const char *endstr = str + lenstr;
@@ -301,7 +301,7 @@ find_word(char *str, int lenstr, char **endword)
{
int clen = pg_mblen_range(beginword, endstr);
- if (ISWORDCHR(beginword, clen))
+ if (ISWORDCHR(beginword, clen, collation))
break;
beginword += clen;
}
@@ -314,7 +314,7 @@ find_word(char *str, int lenstr, char **endword)
{
int clen = pg_mblen_range(*endword, endstr);
- if (!ISWORDCHR(*endword, clen))
+ if (!ISWORDCHR(*endword, clen, collation))
break;
*endword += clen;
}
@@ -490,7 +490,7 @@ generate_trgm_only(growable_trgm_array *dst, char *str, int slen, Oid collation,
}
eword = str;
- while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
+ while ((bword = find_word(eword, slen - (eword - str), &eword, collation)) != NULL)
{
int oldlen;
@@ -907,7 +907,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
*/
static const char *
get_wildcard_part(const char *str, int lenstr,
- char *buf, int *bytelen)
+ char *buf, int *bytelen, Oid collation)
{
const char *beginword = str;
const char *endword;
@@ -930,7 +930,7 @@ get_wildcard_part(const char *str, int lenstr,
if (in_escape)
{
- if (ISWORDCHR(beginword, clen))
+ if (ISWORDCHR(beginword, clen, collation))
break;
in_escape = false;
in_leading_wildcard_meta = false;
@@ -941,7 +941,7 @@ get_wildcard_part(const char *str, int lenstr,
in_escape = true;
else if (ISWILDCARDCHAR(beginword))
in_leading_wildcard_meta = true;
- else if (ISWORDCHR(beginword, clen))
+ else if (ISWORDCHR(beginword, clen, collation))
break;
else
in_leading_wildcard_meta = false;
@@ -979,7 +979,7 @@ get_wildcard_part(const char *str, int lenstr,
clen = pg_mblen_range(endword, endstr);
if (in_escape)
{
- if (ISWORDCHR(endword, clen))
+ if (ISWORDCHR(endword, clen, collation))
{
memcpy(s, endword, clen);
s += clen;
@@ -1006,7 +1006,7 @@ get_wildcard_part(const char *str, int lenstr,
in_trailing_wildcard_meta = true;
break;
}
- else if (ISWORDCHR(endword, clen))
+ else if (ISWORDCHR(endword, clen, collation))
{
memcpy(s, endword, clen);
s += clen;
@@ -1070,7 +1070,7 @@ generate_wildcard_trgm(const char *str, int slen, Oid collation)
*/
eword = str;
while ((eword = get_wildcard_part(eword, slen - (eword - str),
- buf, &bytelen)) != NULL)
+ buf, &bytelen, collation)) != NULL)
{
char *word;
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index 2f190df2f65..57f7b12c3d9 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -811,7 +811,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation)
if (!clen)
continue; /* ok to ignore it altogether */
- if (ISWORDCHR(c.bytes, clen))
+ if (ISWORDCHR(c.bytes, clen, collation))
colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
else
colorInfo->containsNonWord = true;
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index df02ffb12fd..6f331e054a2 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -26,27 +26,27 @@ static void tsearch_readline_callback(void *arg);
#define GENERATE_T_ISCLASS_DEF(character_class) \
/* mblen shall be that of the first character */ \
int \
-t_is##character_class##_with_len(const char *ptr, int mblen) \
+t_is##character_class##_with_len_collation(const char *ptr, int mblen, Oid collation) \
{ \
pg_wchar wstr[WC_BUF_LEN]; \
int wlen pg_attribute_unused(); \
wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \
Assert(wlen <= 1); \
/* pass single character, or NUL if empty */ \
- return pg_isw##character_class(wstr[0], pg_database_locale()); \
+ return pg_isw##character_class(wstr[0], pg_newlocale_from_collation(collation)); \
} \
\
/* ptr shall point to a NUL-terminated string */ \
int \
t_is##character_class##_cstr(const char *ptr) \
{ \
- return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+ return t_is##character_class##_with_len_collation(ptr, pg_mblen_cstr(ptr), DEFAULT_COLLATION_OID); \
} \
/* ptr shall point to a string with pre-validated encoding */ \
int \
t_is##character_class##_unbounded(const char *ptr) \
{ \
- return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+ return t_is##character_class##_with_len_collation(ptr, pg_mblen_unbounded(ptr), DEFAULT_COLLATION_OID); \
} \
/* historical name for _unbounded */ \
int \
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index 6e2d67ee4a5..7ad7042d523 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -18,6 +18,7 @@
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
+#include "catalog/pg_collation.h"
#include "utils/pg_locale.h"
/* working state for tsearch_readline (should be a local var in caller) */
@@ -56,7 +57,7 @@ ts_copychar_cstr(void *dest, const void *src)
#define COPYCHAR ts_copychar_cstr
#define GENERATE_T_ISCLASS_DECL(character_class) \
-extern int t_is##character_class##_with_len(const char *ptr, int len); \
+extern int t_is##character_class##_with_len_collation(const char *ptr, int len, Oid collation); \
extern int t_is##character_class##_cstr(const char *ptr); \
extern int t_is##character_class##_unbounded(const char *ptr); \
\
--
2.51.0
From 3bdef59a6f4ec89b7855dbff1b560fb8753a3130 Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Wed, 21 Jan 2026 14:54:28 +0100
Subject: [PATCH v6 1/2] Use correct collation for lowercasing
pg_trgm converts the input words to lowercase before extracting the
trigrams. The lowercase conversion depends on the collation. Previously,
pg_trgm always used the default collation. Now, the specified collation
is used instead.
---
contrib/pg_trgm/Makefile | 2 +-
.../pg_trgm/expected/pg_trgm_collation.out | 43 +++++++++++++
.../pg_trgm/expected/pg_trgm_collation_1.out | 9 +++
contrib/pg_trgm/meson.build | 1 +
contrib/pg_trgm/sql/pg_trgm_collation.sql | 24 ++++++++
contrib/pg_trgm/trgm.h | 4 +-
contrib/pg_trgm/trgm_gin.c | 7 ++-
contrib/pg_trgm/trgm_gist.c | 10 ++--
contrib/pg_trgm/trgm_op.c | 60 ++++++++++---------
contrib/pg_trgm/trgm_regexp.c | 20 +++----
10 files changed, 132 insertions(+), 48 deletions(-)
create mode 100644 contrib/pg_trgm/expected/pg_trgm_collation.out
create mode 100644 contrib/pg_trgm/expected/pg_trgm_collation_1.out
create mode 100644 contrib/pg_trgm/sql/pg_trgm_collation.sql
diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile
index c1756993ec7..26b3028b75e 100644
--- a/contrib/pg_trgm/Makefile
+++ b/contrib/pg_trgm/Makefile
@@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \
pg_trgm--1.0--1.1.sql
PGFILEDESC = "pg_trgm - trigram matching"
-REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm
+REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm pg_trgm_collation
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/pg_trgm/expected/pg_trgm_collation.out b/contrib/pg_trgm/expected/pg_trgm_collation.out
new file mode 100644
index 00000000000..472ce867665
--- /dev/null
+++ b/contrib/pg_trgm/expected/pg_trgm_collation.out
@@ -0,0 +1,43 @@
+/*
+ * This test is for ICU collations.
+ */
+/* skip test if not UTF8 server encoding or no ICU collations installed */
+SELECT getdatabaseencoding() <> 'UTF8' OR
+ (SELECT count(*) FROM pg_collation WHERE collprovider = 'i' AND collname <> 'unicode') = 0
+ AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Test that lowercase conversion of trigrams uses specified collation
+CREATE TABLE test(col TEXT COLLATE "tr-x-icu");
+INSERT INTO test VALUES ('ISTANBUL');
+SELECT show_trgm(col) FROM test;
+ show_trgm
+--------------------------------------------------------
+ {0xf31e1a,0xfe581d,0x3efd30,anb,bul,nbu,sta,tan,"ul "}
+(1 row)
+
+SELECT show_trgm('ISTANBUL' COLLATE "tr-x-icu");
+ show_trgm
+--------------------------------------------------------
+ {0xf31e1a,0xfe581d,0x3efd30,anb,bul,nbu,sta,tan,"ul "}
+(1 row)
+
+SELECT show_trgm('ISTANBUL' COLLATE "C");
+ show_trgm
+---------------------------------------------
+ {" i"," is",anb,bul,ist,nbu,sta,tan,"ul "}
+(1 row)
+
+SELECT similarity('ıstanbul' COLLATE "tr-x-icu", 'ISTANBUL' COLLATE "tr-x-icu");
+ similarity
+------------
+ 1
+(1 row)
+
+SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
+ similarity
+------------
+ 0.54545456
+(1 row)
+
diff --git a/contrib/pg_trgm/expected/pg_trgm_collation_1.out b/contrib/pg_trgm/expected/pg_trgm_collation_1.out
new file mode 100644
index 00000000000..25c99c4abf0
--- /dev/null
+++ b/contrib/pg_trgm/expected/pg_trgm_collation_1.out
@@ -0,0 +1,9 @@
+/*
+ * This test is for ICU collations.
+ */
+/* skip test if not UTF8 server encoding or no ICU collations installed */
+SELECT getdatabaseencoding() <> 'UTF8' OR
+ (SELECT count(*) FROM pg_collation WHERE collprovider = 'i' AND collname <> 'unicode') = 0
+ AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build
index 3ecf95ba862..5eafa774435 100644
--- a/contrib/pg_trgm/meson.build
+++ b/contrib/pg_trgm/meson.build
@@ -42,6 +42,7 @@ tests += {
'pg_utf8_trgm',
'pg_word_trgm',
'pg_strict_word_trgm',
+ 'pg_trgm_collation',
],
},
}
diff --git a/contrib/pg_trgm/sql/pg_trgm_collation.sql b/contrib/pg_trgm/sql/pg_trgm_collation.sql
new file mode 100644
index 00000000000..afb3973a8b3
--- /dev/null
+++ b/contrib/pg_trgm/sql/pg_trgm_collation.sql
@@ -0,0 +1,24 @@
+/*
+ * This test is for ICU collations.
+ */
+
+/* skip test if not UTF8 server encoding or no ICU collations installed */
+SELECT getdatabaseencoding() <> 'UTF8' OR
+ (SELECT count(*) FROM pg_collation WHERE collprovider = 'i' AND collname <> 'unicode') = 0
+ AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Test that lowercase conversion of trigrams uses specified collation
+
+CREATE TABLE test(col TEXT COLLATE "tr-x-icu");
+INSERT INTO test VALUES ('ISTANBUL');
+SELECT show_trgm(col) FROM test;
+SELECT show_trgm('ISTANBUL' COLLATE "tr-x-icu");
+
+SELECT show_trgm('ISTANBUL' COLLATE "C");
+
+SELECT similarity('ıstanbul' COLLATE "tr-x-icu", 'ISTANBUL' COLLATE "tr-x-icu");
+SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
+
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index ca23aad4dd9..b6911e91458 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -119,8 +119,8 @@ extern double strict_word_similarity_threshold;
extern double index_strategy_get_limit(StrategyNumber strategy);
extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
-extern TRGM *generate_trgm(char *str, int slen);
-extern TRGM *generate_wildcard_trgm(const char *str, int slen);
+extern TRGM *generate_trgm(char *str, int slen, Oid collation);
+extern TRGM *generate_wildcard_trgm(const char *str, int slen, Oid collation);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
index 5766b3e9955..14a892c657d 100644
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -42,7 +42,7 @@ gin_extract_value_trgm(PG_FUNCTION_ARGS)
*nentries = 0;
- trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+ trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val), PG_GET_COLLATION());
trglen = ARRNELEM(trg);
if (trglen > 0)
@@ -93,7 +93,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
case EqualStrategyNumber:
- trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+ trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val), PG_GET_COLLATION());
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
@@ -107,7 +107,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
* potentially-matching string must include.
*/
trg = generate_wildcard_trgm(VARDATA_ANY(val),
- VARSIZE_ANY_EXHDR(val));
+ VARSIZE_ANY_EXHDR(val),
+ PG_GET_COLLATION());
break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index 11812b2984e..d9102400442 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -123,7 +123,7 @@ gtrgm_compress(PG_FUNCTION_ARGS)
TRGM *res;
text *val = DatumGetTextPP(entry->key);
- res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+ res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val), PG_GET_COLLATION());
retval = palloc_object(GISTENTRY);
gistentryinit(*retval, PointerGetDatum(res),
entry->rel, entry->page,
@@ -242,7 +242,8 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
case StrictWordSimilarityStrategyNumber:
case EqualStrategyNumber:
qtrg = generate_trgm(VARDATA(query),
- querysize - VARHDRSZ);
+ querysize - VARHDRSZ,
+ PG_GET_COLLATION());
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
@@ -251,7 +252,8 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
pg_fallthrough;
case LikeStrategyNumber:
qtrg = generate_wildcard_trgm(VARDATA(query),
- querysize - VARHDRSZ);
+ querysize - VARHDRSZ,
+ PG_GET_COLLATION());
break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
@@ -475,7 +477,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
{
char *newcache;
- qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ);
+ qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ, PG_GET_COLLATION());
newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
MAXALIGN(querysize) +
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index ee89e548d16..5f2defb57f2 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -447,7 +447,7 @@ done:
* bounds_p: where to return bounds of trigrams (if needed).
*/
static void
-generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p)
+generate_trgm_only(growable_trgm_array *dst, char *str, int slen, Oid collation, TrgmBound **bounds_p)
{
size_t buflen;
char *buf;
@@ -499,7 +499,7 @@ generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bo
{
char *lowered;
- lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
+ lowered = str_tolower(bword, eword - bword, collation);
bytelen = strlen(lowered);
/* grow the buffer if necessary */
@@ -553,13 +553,13 @@ generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bo
* Returns the sorted array of unique trigrams.
*/
TRGM *
-generate_trgm(char *str, int slen)
+generate_trgm(char *str, int slen, Oid collation)
{
TRGM *trg;
growable_trgm_array arr;
int len;
- generate_trgm_only(&arr, str, slen, NULL);
+ generate_trgm_only(&arr, str, slen, collation, NULL);
len = arr.length;
trg = arr.datum;
trg->flag = ARRKEY;
@@ -814,7 +814,7 @@ iterate_word_similarity(int *trg2indexes,
*/
static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
- uint8 flags)
+ uint8 flags, Oid collation)
{
bool *found;
pos_trgm *ptrg;
@@ -832,9 +832,9 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
/* Make positional trigrams */
- generate_trgm_only(&trg1, str1, slen1, NULL);
+ generate_trgm_only(&trg1, str1, slen1, collation, NULL);
len1 = trg1.length;
- generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
+ generate_trgm_only(&trg2, str2, slen2, collation, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
len2 = trg2.length;
ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2);
@@ -1043,7 +1043,7 @@ get_wildcard_part(const char *str, int lenstr,
* " a", "bcd" would be extracted.
*/
TRGM *
-generate_wildcard_trgm(const char *str, int slen)
+generate_wildcard_trgm(const char *str, int slen, Oid collation)
{
TRGM *trg;
growable_trgm_array arr;
@@ -1075,7 +1075,7 @@ generate_wildcard_trgm(const char *str, int slen)
char *word;
#ifdef IGNORECASE
- word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
+ word = str_tolower(buf, bytelen, collation);
bytelen = strlen(word);
#else
word = buf;
@@ -1134,7 +1134,7 @@ show_trgm(PG_FUNCTION_ARGS)
trgm *ptr;
int i;
- trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+ trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in), PG_GET_COLLATION());
d = palloc_array(Datum, 1 + ARRNELEM(trg));
for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
@@ -1301,8 +1301,8 @@ similarity(PG_FUNCTION_ARGS)
*trg2;
float4 res;
- trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
- trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
+ trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), PG_GET_COLLATION());
+ trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), PG_GET_COLLATION());
res = cnt_sml(trg1, trg2, false);
@@ -1323,7 +1323,7 @@ word_similarity(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
- 0);
+ 0, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1339,7 +1339,7 @@ strict_word_similarity(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
- WORD_SIMILARITY_STRICT);
+ WORD_SIMILARITY_STRICT, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1349,9 +1349,10 @@ strict_word_similarity(PG_FUNCTION_ARGS)
Datum
similarity_dist(PG_FUNCTION_ARGS)
{
- float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
- PG_GETARG_DATUM(0),
- PG_GETARG_DATUM(1)));
+ float4 res = DatumGetFloat4(DirectFunctionCall2Coll(similarity,
+ PG_GET_COLLATION(),
+ PG_GETARG_DATUM(0),
+ PG_GETARG_DATUM(1)));
PG_RETURN_FLOAT4(1.0 - res);
}
@@ -1359,9 +1360,10 @@ similarity_dist(PG_FUNCTION_ARGS)
Datum
similarity_op(PG_FUNCTION_ARGS)
{
- float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
- PG_GETARG_DATUM(0),
- PG_GETARG_DATUM(1)));
+ float4 res = DatumGetFloat4(DirectFunctionCall2Coll(similarity,
+ PG_GET_COLLATION(),
+ PG_GETARG_DATUM(0),
+ PG_GETARG_DATUM(1)));
PG_RETURN_BOOL(res >= similarity_threshold);
}
@@ -1375,7 +1377,7 @@ word_similarity_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
- WORD_SIMILARITY_CHECK_ONLY);
+ WORD_SIMILARITY_CHECK_ONLY, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1391,7 +1393,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
- WORD_SIMILARITY_CHECK_ONLY);
+ WORD_SIMILARITY_CHECK_ONLY, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1407,7 +1409,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
- 0);
+ 0, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1423,7 +1425,7 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
- 0);
+ 0, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1439,7 +1441,8 @@ strict_word_similarity_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
- WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
+ WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT,
+ PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1455,7 +1458,8 @@ strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
- WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
+ WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT,
+ PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1471,7 +1475,7 @@ strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
- WORD_SIMILARITY_STRICT);
+ WORD_SIMILARITY_STRICT, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@@ -1487,7 +1491,7 @@ strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
- WORD_SIMILARITY_STRICT);
+ WORD_SIMILARITY_STRICT, PG_GET_COLLATION());
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index efee4cf5fb4..2f190df2f65 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -479,11 +479,11 @@ typedef struct
/* prototypes for private functions */
static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
- MemoryContext rcontext);
+ MemoryContext rcontext, Oid collation);
static void RE_compile(regex_t *regex, text *text_re,
int cflags, Oid collation);
-static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static int convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation);
+static int convertPgWchar(pg_wchar c, trgm_mb_char *result, Oid collation);
static void transformGraph(TrgmNFA *trgmNFA);
static void processState(TrgmNFA *trgmNFA, TrgmState *state);
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
@@ -551,7 +551,7 @@ createTrgmNFA(text *text_re, Oid collation,
REG_ADVANCED | REG_NOSUB, collation);
#endif
- trg = createTrgmNFAInternal(&regex, graph, rcontext);
+ trg = createTrgmNFAInternal(&regex, graph, rcontext, collation);
/* Clean up all the cruft we created (including regex) */
MemoryContextSwitchTo(oldcontext);
@@ -565,7 +565,7 @@ createTrgmNFA(text *text_re, Oid collation,
*/
static TRGM *
createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
- MemoryContext rcontext)
+ MemoryContext rcontext, Oid collation)
{
TRGM *trg;
TrgmNFA trgmNFA;
@@ -573,7 +573,7 @@ createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
trgmNFA.regex = regex;
/* Collect color information from the regex */
- getColorInfo(regex, &trgmNFA);
+ getColorInfo(regex, &trgmNFA, collation);
#ifdef TRGM_REGEXP_DEBUG
printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
@@ -762,7 +762,7 @@ RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
* Fill TrgmColorInfo structure for each color using regex export functions.
*/
static void
-getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
+getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation)
{
int colorsCount = pg_reg_getnumcolors(regex);
int i;
@@ -807,7 +807,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
for (j = 0; j < charsCount; j++)
{
trgm_mb_char c;
- int clen = convertPgWchar(chars[j], &c);
+ int clen = convertPgWchar(chars[j], &c, collation);
if (!clen)
continue; /* ok to ignore it altogether */
@@ -827,7 +827,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
* byte length.
*/
static int
-convertPgWchar(pg_wchar c, trgm_mb_char *result)
+convertPgWchar(pg_wchar c, trgm_mb_char *result, Oid collation)
{
/* "s" has enough space for a multibyte character and a trailing NUL */
char s[MAX_MULTIBYTE_CHAR_LEN + 1];
@@ -860,7 +860,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
*/
#ifdef IGNORECASE
{
- char *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID);
+ char *lowerCased = str_tolower(s, clen, collation);
if (strcmp(lowerCased, s) != 0)
{
--
2.51.0