Re: Use CASEFOLD() internally rather than LOWER()

Jeff Davis Tue, 03 Mar 2026 13:02:10 -0800

On Sat, 2026-02-28 at 14:27 +0100, Daniel Verite wrote:
> I tried 0001 with a non-UTF8 database and got quickly stuck:


Attached are new versions. I moved the encoding check out of
str_casefold() and into the SQL-callable casefold() function, so other
callers can use str_casefold() directly regardless of the database
encoding. That also slightly simplifies what happens in ILIKE.

I removed the citext changes. citext has somewhat of a legacy status, I
think, so I'm not sure it makes sense to try to modernize or change it.
Also, some SQL-language functions in citext use LOWER(), so those
changes alone would not have been enough: we'd also need to make the
SQL CASEFOLD function callable in other encodings, and run a citext
upgrade script to change the definitions.

Note that these changes affect the result of some expressions (e.g.
ILIKE), so they could theoretically make an existing expression index
or predicate index inconsistent.

Regards,
        Jeff Davis

From a5a1cd1c3cd5a3d7f00b11672052fd9087ea35b0 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 08:58:43 -0800
Subject: [PATCH v2 1/3] ILIKE: use CASEFOLD() rather than LOWER().

For non-C locales, we casefold the entire string before performing
pattern matching with ILIKE. Previously, casefolding was done with the
LOWER() function; now that a proper CASEFOLD() function exists, use
that instead.

CASEFOLD() is better than LOWER() for case-insensitive comparisons in
builtin and ICU locales. For instance, CASEFOLD() transforms a GREEK
SMALL LETTER FINAL SIGMA (U+03C2) into GREEK SMALL LETTER SIGMA
(U+03C3) so that the two characters match in a case-insensitive
comparison; whereas LOWER() does not transform it because it's already
lowercase, so they will not match.
---
 src/backend/utils/adt/formatting.c         |  5 -----
 src/backend/utils/adt/like.c               | 17 ++++++---------
 src/backend/utils/adt/oracle_compat.c      |  5 +++++
 src/test/regress/expected/collate.utf8.out | 24 ++++++++++++++++++++++
 src/test/regress/sql/collate.utf8.sql      |  6 ++++++
 5 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 0716aff22b6..6d0fd26771a 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1829,11 +1829,6 @@ str_casefold(const char *buff, size_t nbytes, Oid collid)
 				 errhint("Use the COLLATE clause to set the collation explicitly.")));
 	}
 
-	if (GetDatabaseEncoding() != PG_UTF8)
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
-
 	mylocale = pg_newlocale_from_collation(collid);
 
 	/* C/POSIX collations use this path regardless of database encoding */
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 350bc07f210..8fd67705f1b 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -23,6 +23,7 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/fmgrprotos.h"
+#include "utils/formatting.h"
 #include "utils/pg_locale.h"
 #include "varatt.h"
 
@@ -190,10 +191,8 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 				 errmsg("nondeterministic collations are not supported for ILIKE")));
 
 	/*
-	 * For efficiency reasons, in the C locale we don't call lower() on the
+	 * For efficiency reasons, in the C locale we don't call casefold() on the
 	 * pattern and text, but instead lowercase each character lazily.
-	 *
-	 * XXX: use casefolding instead?
 	 */
 
 	if (locale->ctype_is_c)
@@ -206,14 +205,10 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 	}
 	else
 	{
-		pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
-													 PointerGetDatum(pat)));
-		p = VARDATA_ANY(pat);
-		plen = VARSIZE_ANY_EXHDR(pat);
-		str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
-													 PointerGetDatum(str)));
-		s = VARDATA_ANY(str);
-		slen = VARSIZE_ANY_EXHDR(str);
+		p = str_casefold(VARDATA_ANY(pat), VARSIZE_ANY_EXHDR(pat), collation);
+		plen = strlen(p);
+		s = str_casefold(VARDATA_ANY(str), VARSIZE_ANY_EXHDR(str), collation);
+		slen = strlen(s);
 
 		if (GetDatabaseEncoding() == PG_UTF8)
 			return UTF8_MatchText(s, slen, p, plen, 0);
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index 5b0d098bd07..855769d5776 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -133,6 +133,11 @@ casefold(PG_FUNCTION_ARGS)
 	char	   *out_string;
 	text	   *result;
 
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
+
 	out_string = str_casefold(VARDATA_ANY(in_string),
 							  VARSIZE_ANY_EXHDR(in_string),
 							  PG_GET_COLLATION());
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..3d4292611e2 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -169,6 +169,18 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_C_UTF8);
  abcd 123 #$% ıiiİ ß ß ǆǆǆ σσσ
 (1 row)
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+ ?column? 
+----------
+ f
+(1 row)
+
 --
 -- Test PG_UNICODE_FAST
 --
@@ -338,3 +350,15 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FA
  abcd 123 #$% ıiii̇ ss ss ǆǆǆ σσσ
 (1 row)
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..4a5e519cf07 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -85,6 +85,9 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_C_UTF8);
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+
 --
 -- Test PG_UNICODE_FAST
 --
@@ -148,3 +151,6 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
 
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FAST);
+
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
-- 
2.43.0

From 580db5a6129c75fc45a054a31c03d3d0e9603f37 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 09:11:56 -0800
Subject: [PATCH v2 2/3] dict_xsyn: use CASEFOLD() rather than LOWER().

CASEFOLD() is better for case-insensitive matching in edge cases.
---
 contrib/dict_xsyn/dict_xsyn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 9e3784e0f47..327c5e6f5ac 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -98,7 +98,7 @@ read_dictionary(DictSyn *d, const char *filename)
 		if (*line == '\0')
 			continue;
 
-		value = str_tolower(line, strlen(line), DEFAULT_COLLATION_OID);
+		value = str_casefold(line, strlen(line), DEFAULT_COLLATION_OID);
 		pfree(line);
 
 		pos = value;
@@ -215,7 +215,7 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
 	{
 		char	   *temp = pnstrdup(in, length);
 
-		word.key = str_tolower(temp, length, DEFAULT_COLLATION_OID);
+		word.key = str_casefold(temp, length, DEFAULT_COLLATION_OID);
 		pfree(temp);
 		word.value = NULL;
 	}
-- 
2.43.0

From 66fef6d874ed567de664b1a0a63899a8395cd660 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 12 Jan 2026 09:24:16 -0800
Subject: [PATCH v2 3/3] tsearch: use CASEFOLD() rather than LOWER().

CASEFOLD() is better for case-insensitive matching in edge cases.
---
 src/backend/snowball/dict_snowball.c | 4 ++--
 src/backend/tsearch/dict_ispell.c    | 4 ++--
 src/backend/tsearch/dict_simple.c    | 4 ++--
 src/backend/tsearch/dict_synonym.c   | 6 +++---
 src/backend/tsearch/spell.c          | 6 +++---
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 182bd156995..cb2d3061953 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -251,7 +251,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+			readstoplist(defGetString(defel), &d->stoplist, str_casefold);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "language") == 0)
@@ -287,7 +287,7 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32		len = PG_GETARG_INT32(2);
-	char	   *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	char	   *txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 	TSLexeme   *res = palloc0_array(TSLexeme, 2);
 
 	/*
diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c
index ad5c26ebccb..bdcfc836e80 100644
--- a/src/backend/tsearch/dict_ispell.c
+++ b/src/backend/tsearch/dict_ispell.c
@@ -79,7 +79,7 @@ dispell_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &(d->stoplist), str_tolower);
+			readstoplist(defGetString(defel), &(d->stoplist), str_casefold);
 			stoploaded = true;
 		}
 		else
@@ -128,7 +128,7 @@ dispell_lexize(PG_FUNCTION_ARGS)
 	if (len <= 0)
 		PG_RETURN_POINTER(NULL);
 
-	txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 	res = NINormalizeWord(&(d->obj), txt);
 
 	if (res == NULL)
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c
index 44d945b2be8..52df5251e20 100644
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -48,7 +48,7 @@ dsimple_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+			readstoplist(defGetString(defel), &d->stoplist, str_casefold);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "accept") == 0)
@@ -81,7 +81,7 @@ dsimple_lexize(PG_FUNCTION_ARGS)
 	char	   *txt;
 	TSLexeme   *res;
 
-	txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
index 3937f25bcc6..44d8ffaf0ec 100644
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -185,8 +185,8 @@ dsynonym_init(PG_FUNCTION_ARGS)
 		}
 		else
 		{
-			d->syn[cur].in = str_tolower(starti, strlen(starti), DEFAULT_COLLATION_OID);
-			d->syn[cur].out = str_tolower(starto, strlen(starto), DEFAULT_COLLATION_OID);
+			d->syn[cur].in = str_casefold(starti, strlen(starti), DEFAULT_COLLATION_OID);
+			d->syn[cur].out = str_casefold(starto, strlen(starto), DEFAULT_COLLATION_OID);
 		}
 
 		d->syn[cur].outlen = strlen(starto);
@@ -226,7 +226,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
 	if (d->case_sensitive)
 		key.in = pnstrdup(in, len);
 	else
-		key.in = str_tolower(in, len, DEFAULT_COLLATION_OID);
+		key.in = str_casefold(in, len, DEFAULT_COLLATION_OID);
 
 	key.out = NULL;
 
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index a1bfd2a9f9b..11b5d5739d1 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -170,7 +170,7 @@ cpstrdup(IspellDict *Conf, const char *str)
 
 
 /*
- * Apply str_tolower(), producing a temporary result (in the buildCxt).
+ * Apply str_casefold(), producing a temporary result (in the buildCxt).
  */
 static char *
 lowerstr_ctx(IspellDict *Conf, const char *src)
@@ -179,7 +179,7 @@ lowerstr_ctx(IspellDict *Conf, const char *src)
 	char	   *dst;
 
 	saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
-	dst = str_tolower(src, strlen(src), DEFAULT_COLLATION_OID);
+	dst = str_casefold(src, strlen(src), DEFAULT_COLLATION_OID);
 	MemoryContextSwitchTo(saveCtx);
 
 	return dst;
@@ -1447,7 +1447,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 
 	while ((recoded = tsearch_readline(&trst)) != NULL)
 	{
-		pstr = str_tolower(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
+		pstr = str_casefold(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
 
 		/* Skip comments and empty lines */
 		if (*pstr == '#' || *pstr == '\n')
-- 
2.43.0

Re: Use CASEFOLD() internally rather than LOWER()

Reply via email to