Hi hackers,

In thread [1] we found that pg_trgm always uses DEFAULT_COLLATION_OID
for converting trigrams to lower-case. Here are some examples where
today the collation is ignored:

CREATE EXSTENSION pg_trgm;
CREATE COLLATION turkish (provider = libc, locale = 'tr_TR.utf8');

postgres=# SELECT show_trgm('ISTANBUL' COLLATE "turkish");
                  show_trgm
---------------------------------------------
 {"  i"," is",anb,bul,ist,nbu,sta,tan,"ul "}

CREATE TABLE test(col TEXT COLLATE "turkish");
INSERT INTO test VALUES ('ISTANBUL');

postgres=# select show_trgm(col) FROM test;
                  show_trgm
---------------------------------------------
 {"  i"," is",anb,bul,ist,nbu,sta,tan,"ul "}

postgres=# SELECT similarity('ıstanbul' COLLATE "turkish", 'ISTANBUL'
COLLATE "turkish");
 similarity
------------
        0.5

If the database is initialized via initdb --locale="tr_TR.utf8", the
output changes:

postgres=# SELECT show_trgm('ISTANBUL');
                       show_trgm
--------------------------------------------------------
 {0xf31e1a,0xfe581d,0x3efd30,anb,bul,nbu,sta,tan,"ul "}

and

postgres=# select show_trgm(col) FROM test;
                       show_trgm
--------------------------------------------------------
 {0xf31e1a,0xfe581d,0x3efd30,anb,bul,nbu,sta,tan,"ul "}

postgres=# SELECT similarity('ıstanbul' COLLATE "turkish", 'ISTANBUL'
COLLATE "turkish");
 similarity
------------
          1

tr_TR.utf8 converts capital I to ı which is a multibyte character, while
my default collation converts I to i.

The attached patch attempts to fix that. I grepped for all occurrences
of DEFAULT_COLLATION_OID in contrib/pg_trgm and use the function's
collation OID instead DEFAULT_COLLATION_OID.

The corresponding regression tests pass.

[1]
https://www.postgresql.org/message-id/e5dd01c6-c469-405d-aea2-feca0b2dc34d%40gmail.com

--
David Geier
From 394762ab49cae50a792c620a6e9393a53f2d476a Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Wed, 21 Jan 2026 14:54:28 +0100
Subject: [PATCH v1] Use correct collation in pg_trgm

---
 contrib/pg_trgm/trgm.h        |  4 +--
 contrib/pg_trgm/trgm_gin.c    |  7 ++--
 contrib/pg_trgm/trgm_gist.c   | 10 +++---
 contrib/pg_trgm/trgm_op.c     | 60 +++++++++++++++++++----------------
 contrib/pg_trgm/trgm_regexp.c | 20 ++++++------
 5 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index ca017585369..147eefaa3c6 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -119,8 +119,8 @@ extern double strict_word_similarity_threshold;
 extern double index_strategy_get_limit(StrategyNumber strategy);
 extern uint32 trgm2int(trgm *ptr);
 extern void compact_trigram(trgm *tptr, char *str, int bytelen);
-extern TRGM *generate_trgm(char *str, int slen);
-extern TRGM *generate_wildcard_trgm(const char *str, int slen);
+extern TRGM *generate_trgm(char *str, int slen, Oid collation);
+extern TRGM *generate_wildcard_trgm(const char *str, int slen, Oid collation);
 extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
 extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
 extern bool *trgm_presence_map(TRGM *query, TRGM *key);
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
index 66ff6adde99..3260c14571e 100644
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -42,7 +42,7 @@ gin_extract_value_trgm(PG_FUNCTION_ARGS)
 
 	*nentries = 0;
 
-	trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+	trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val), PG_GET_COLLATION());
 	trglen = ARRNELEM(trg);
 
 	if (trglen > 0)
@@ -91,7 +91,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 		case WordSimilarityStrategyNumber:
 		case StrictWordSimilarityStrategyNumber:
 		case EqualStrategyNumber:
-			trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+			trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val), PG_GET_COLLATION());
 			break;
 		case ILikeStrategyNumber:
 #ifndef IGNORECASE
@@ -105,7 +105,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 			 * potentially-matching string must include.
 			 */
 			trg = generate_wildcard_trgm(VARDATA_ANY(val),
-										 VARSIZE_ANY_EXHDR(val));
+										 VARSIZE_ANY_EXHDR(val),
+										 PG_GET_COLLATION());
 			break;
 		case RegExpICaseStrategyNumber:
 #ifndef IGNORECASE
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index 5c7deb103a6..27da2f5d6c0 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -123,7 +123,7 @@ gtrgm_compress(PG_FUNCTION_ARGS)
 		TRGM	   *res;
 		text	   *val = DatumGetTextPP(entry->key);
 
-		res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+		res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val), PG_GET_COLLATION());
 		retval = palloc_object(GISTENTRY);
 		gistentryinit(*retval, PointerGetDatum(res),
 					  entry->rel, entry->page,
@@ -241,7 +241,8 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 			case StrictWordSimilarityStrategyNumber:
 			case EqualStrategyNumber:
 				qtrg = generate_trgm(VARDATA(query),
-									 querysize - VARHDRSZ);
+									 querysize - VARHDRSZ,
+									 PG_GET_COLLATION());
 				break;
 			case ILikeStrategyNumber:
 #ifndef IGNORECASE
@@ -250,7 +251,8 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 				/* FALL THRU */
 			case LikeStrategyNumber:
 				qtrg = generate_wildcard_trgm(VARDATA(query),
-											  querysize - VARHDRSZ);
+											  querysize - VARHDRSZ,
+											  PG_GET_COLLATION());
 				break;
 			case RegExpICaseStrategyNumber:
 #ifndef IGNORECASE
@@ -473,7 +475,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	{
 		char	   *newcache;
 
-		qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ);
+		qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ, PG_GET_COLLATION());
 
 		newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
 									  MAXALIGN(querysize) +
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 81182a15e07..d4c75caeff9 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -324,7 +324,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
  * Returns length of the generated array.
  */
 static int
-generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
+generate_trgm_only(trgm *trg, char *str, int slen, Oid collation, TrgmBound *bounds)
 {
 	trgm	   *tptr;
 	char	   *buf;
@@ -352,7 +352,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 	{
 #ifdef IGNORECASE
-		bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
+		bword = str_tolower(bword, eword - bword, collation);
 		bytelen = strlen(bword);
 #else
 		bytelen = eword - bword;
@@ -405,7 +405,7 @@ protect_out_of_mem(int slen)
  * Returns the sorted array of unique trigrams.
  */
 TRGM *
-generate_trgm(char *str, int slen)
+generate_trgm(char *str, int slen, Oid collation)
 {
 	TRGM	   *trg;
 	int			len;
@@ -415,7 +415,7 @@ generate_trgm(char *str, int slen)
 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
 	trg->flag = ARRKEY;
 
-	len = generate_trgm_only(GETARR(trg), str, slen, NULL);
+	len = generate_trgm_only(GETARR(trg), str, slen, collation, NULL);
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
 
 	if (len == 0)
@@ -671,7 +671,7 @@ iterate_word_similarity(int *trg2indexes,
  */
 static float4
 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
-					 uint8 flags)
+					 uint8 flags, Oid collation)
 {
 	bool	   *found;
 	pos_trgm   *ptrg;
@@ -697,8 +697,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 	else
 		bounds = NULL;
 
-	len1 = generate_trgm_only(trg1, str1, slen1, NULL);
-	len2 = generate_trgm_only(trg2, str2, slen2, bounds);
+	len1 = generate_trgm_only(trg1, str1, slen1, collation, NULL);
+	len2 = generate_trgm_only(trg2, str2, slen2, collation, bounds);
 
 	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
 	len = len1 + len2;
@@ -915,7 +915,7 @@ get_wildcard_part(const char *str, int lenstr,
  * " a", "bcd" would be extracted.
  */
 TRGM *
-generate_wildcard_trgm(const char *str, int slen)
+generate_wildcard_trgm(const char *str, int slen, Oid collation)
 {
 	TRGM	   *trg;
 	char	   *buf,
@@ -948,7 +948,7 @@ generate_wildcard_trgm(const char *str, int slen)
 									  buf, &bytelen, &charlen)) != NULL)
 	{
 #ifdef IGNORECASE
-		buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
+		buf2 = str_tolower(buf, bytelen, collation);
 		bytelen = strlen(buf2);
 #else
 		buf2 = buf;
@@ -1007,7 +1007,7 @@ show_trgm(PG_FUNCTION_ARGS)
 	trgm	   *ptr;
 	int			i;
 
-	trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+	trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in), PG_GET_COLLATION());
 	d = palloc_array(Datum, 1 + ARRNELEM(trg));
 
 	for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
@@ -1174,8 +1174,8 @@ similarity(PG_FUNCTION_ARGS)
 			   *trg2;
 	float4		res;
 
-	trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
-	trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
+	trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), PG_GET_COLLATION());
+	trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), PG_GET_COLLATION());
 
 	res = cnt_sml(trg1, trg2, false);
 
@@ -1196,7 +1196,7 @@ word_similarity(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   0);
+							   0, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1212,7 +1212,7 @@ strict_word_similarity(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   WORD_SIMILARITY_STRICT);
+							   WORD_SIMILARITY_STRICT, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1222,9 +1222,10 @@ strict_word_similarity(PG_FUNCTION_ARGS)
 Datum
 similarity_dist(PG_FUNCTION_ARGS)
 {
-	float4		res = DatumGetFloat4(DirectFunctionCall2(similarity,
-														 PG_GETARG_DATUM(0),
-														 PG_GETARG_DATUM(1)));
+	float4		res = DatumGetFloat4(DirectFunctionCall2Coll(similarity,
+															 PG_GET_COLLATION(),
+															 PG_GETARG_DATUM(0),
+															 PG_GETARG_DATUM(1)));
 
 	PG_RETURN_FLOAT4(1.0 - res);
 }
@@ -1232,9 +1233,10 @@ similarity_dist(PG_FUNCTION_ARGS)
 Datum
 similarity_op(PG_FUNCTION_ARGS)
 {
-	float4		res = DatumGetFloat4(DirectFunctionCall2(similarity,
-														 PG_GETARG_DATUM(0),
-														 PG_GETARG_DATUM(1)));
+	float4		res = DatumGetFloat4(DirectFunctionCall2Coll(similarity,
+															 PG_GET_COLLATION(),
+															 PG_GETARG_DATUM(0),
+															 PG_GETARG_DATUM(1)));
 
 	PG_RETURN_BOOL(res >= similarity_threshold);
 }
@@ -1248,7 +1250,7 @@ word_similarity_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   WORD_SIMILARITY_CHECK_ONLY);
+							   WORD_SIMILARITY_CHECK_ONLY, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1264,7 +1266,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
-							   WORD_SIMILARITY_CHECK_ONLY);
+							   WORD_SIMILARITY_CHECK_ONLY, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1280,7 +1282,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   0);
+							   0, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1296,7 +1298,7 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
-							   0);
+							   0, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1312,7 +1314,8 @@ strict_word_similarity_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
+							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT,
+							   PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1328,7 +1331,8 @@ strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
-							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
+							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT,
+							   PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1344,7 +1348,7 @@ strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
-							   WORD_SIMILARITY_STRICT);
+							   WORD_SIMILARITY_STRICT, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
@@ -1360,7 +1364,7 @@ strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
 
 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
-							   WORD_SIMILARITY_STRICT);
+							   WORD_SIMILARITY_STRICT, PG_GET_COLLATION());
 
 	PG_FREE_IF_COPY(in1, 0);
 	PG_FREE_IF_COPY(in2, 1);
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index 1d1b5fe304d..cdd04fa01ad 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -479,11 +479,11 @@ typedef struct
 
 /* prototypes for private functions */
 static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
-								   MemoryContext rcontext);
+								   MemoryContext rcontext, Oid collation);
 static void RE_compile(regex_t *regex, text *text_re,
 					   int cflags, Oid collation);
-static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation);
+static bool convertPgWchar(pg_wchar c, trgm_mb_char *result, Oid collation);
 static void transformGraph(TrgmNFA *trgmNFA);
 static void processState(TrgmNFA *trgmNFA, TrgmState *state);
 static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
@@ -551,7 +551,7 @@ createTrgmNFA(text *text_re, Oid collation,
 			   REG_ADVANCED | REG_NOSUB, collation);
 #endif
 
-	trg = createTrgmNFAInternal(&regex, graph, rcontext);
+	trg = createTrgmNFAInternal(&regex, graph, rcontext, collation);
 
 	/* Clean up all the cruft we created (including regex) */
 	MemoryContextSwitchTo(oldcontext);
@@ -565,7 +565,7 @@ createTrgmNFA(text *text_re, Oid collation,
  */
 static TRGM *
 createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
-					  MemoryContext rcontext)
+					  MemoryContext rcontext, Oid collation)
 {
 	TRGM	   *trg;
 	TrgmNFA		trgmNFA;
@@ -573,7 +573,7 @@ createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
 	trgmNFA.regex = regex;
 
 	/* Collect color information from the regex */
-	getColorInfo(regex, &trgmNFA);
+	getColorInfo(regex, &trgmNFA, collation);
 
 #ifdef TRGM_REGEXP_DEBUG
 	printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
@@ -762,7 +762,7 @@ RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
  * Fill TrgmColorInfo structure for each color using regex export functions.
  */
 static void
-getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
+getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation)
 {
 	int			colorsCount = pg_reg_getnumcolors(regex);
 	int			i;
@@ -808,7 +808,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
 		{
 			trgm_mb_char c;
 
-			if (!convertPgWchar(chars[j], &c))
+			if (!convertPgWchar(chars[j], &c, collation))
 				continue;		/* ok to ignore it altogether */
 			if (ISWORDCHR(c.bytes))
 				colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
@@ -825,7 +825,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
  * Returns false if the character should be ignored completely.
  */
 static bool
-convertPgWchar(pg_wchar c, trgm_mb_char *result)
+convertPgWchar(pg_wchar c, trgm_mb_char *result, Oid collation)
 {
 	/* "s" has enough space for a multibyte character and a trailing NUL */
 	char		s[MAX_MULTIBYTE_CHAR_LEN + 1];
@@ -857,7 +857,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
 	 */
 #ifdef IGNORECASE
 	{
-		char	   *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
+		char	   *lowerCased = str_tolower(s, strlen(s), collation);
 
 		if (strcmp(lowerCased, s) != 0)
 		{
-- 
2.51.0

Reply via email to