Re: Remaining dependency on setlocale()

Jeff Davis Thu, 20 Nov 2025 16:58:44 -0800

On Wed, 2025-11-12 at 19:59 +0100, Peter Eisentraut wrote:
> Many of these issues are pre-existing, but I just figured it has
> reached 
> a point where we need to do something about it.


I tried to simplify things in this patch series, assuming that we have
some tolerance for small behavior changes.

0001: No behavior change here, same patch as before. Uncontroversial
simplification, so I plan to commit this soon.

0002: change fuzzystrmatch to use ASCII semantics. As far as I can
tell, this only affects the results of soundex(). Before the patch, in
en_US.iso885915, soundex('réd') was 'RÉ30', after the patch it's
'Ré30'. I'm not sure whether the current behavior is intentional or
not. Other functions (daitch_mokotoff, levenshtein, and metaphone) are
unaffected as far as I can tell.

0003+0005: change ltree to use case folding instead of tolower(). I
believe this is a bug fix, because the current code is inconsistent
between ltree_strncasecmp() and ltree_crc32_sz().

0006-0007: Remove char_tolower() API. This also removes the
optimization for single-byte encodings with the libc provider and a
non-C locale, but simplifies the code (the optimization is retained for
the C locale). It's possible to make the lazy-folding optimization work
for all locales without the char_tolower() API by doing something
simlar to what 0004 does for ltree. But to make this work efficiently
for Generic_Text_IC_like() would be a bit more complex: we'd need to
adjust MatchText() to be able to fold the arguments lazily, and perhaps
introduce some kind of casemapping iterator. That's already a pretty
complex function, so I'm hesitant to do that work unless the
optimization is important.

These patches don't get us quite to the point of eliminating the
LC_CTYPE dependency (there's still downcase_identifier() and
pg_strcasecmp() to worry about, and some assorted isxyz() calls to
examine), but they simplify things enough that the path forward will be
easier.

Regards,
        Jeff Davis

From 82ef752da7f25d0d718f98ef74748a3b3555d1df Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Sun, 26 Oct 2025 14:58:02 -0700
Subject: [PATCH v8 1/7] Avoid global LC_CTYPE dependency in pg_locale_libc.c.

Call tolower_l() directly instead of through pg_tolower(), because the
latter depends on the global LC_CTYPE.
---
 src/backend/utils/adt/pg_locale_libc.c | 28 ++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 9c7fcd1fc7a..716f005066a 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -450,7 +450,12 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		for (p = dest; *p; p++)
 		{
 			if (locale->is_default)
-				*p = pg_tolower((unsigned char) *p);
+			{
+				if (*p >= 'A' && *p <= 'Z')
+					*p += 'a' - 'A';
+				else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+					*p = tolower_l((unsigned char) *p, loc);
+			}
 			else
 				*p = tolower_l((unsigned char) *p, loc);
 		}
@@ -535,9 +540,19 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			if (locale->is_default)
 			{
 				if (wasalnum)
-					*p = pg_tolower((unsigned char) *p);
+				{
+					if (*p >= 'A' && *p <= 'Z')
+						*p += 'a' - 'A';
+					else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+						*p = tolower_l((unsigned char) *p, loc);
+				}
 				else
-					*p = pg_toupper((unsigned char) *p);
+				{
+					if (*p >= 'a' && *p <= 'z')
+						*p -= 'a' - 'A';
+					else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+						*p = toupper_l((unsigned char) *p, loc);
+				}
 			}
 			else
 			{
@@ -633,7 +648,12 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		for (p = dest; *p; p++)
 		{
 			if (locale->is_default)
-				*p = pg_toupper((unsigned char) *p);
+			{
+				if (*p >= 'a' && *p <= 'z')
+					*p -= 'a' - 'A';
+				else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+					*p = toupper_l((unsigned char) *p, loc);
+			}
 			else
 				*p = toupper_l((unsigned char) *p, loc);
 		}
-- 
2.43.0

From 09b3be1438da3561562042b86985439f7a206bf1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 13:24:38 -0800
Subject: [PATCH v8 2/7] fuzzystrmatch: use pg_ascii_toupper().

fuzzystrmatch is designed for ASCII, so no need to rely on the global
LC_CTYPE setting.
---
 contrib/fuzzystrmatch/dmetaphone.c    |  2 +-
 contrib/fuzzystrmatch/fuzzystrmatch.c | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 6627b2b8943..bb5d3e90756 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -284,7 +284,7 @@ MakeUpper(metastring *s)
 	char	   *i;
 
 	for (i = s->str; *i; i++)
-		*i = toupper((unsigned char) *i);
+		*i = pg_ascii_toupper((unsigned char) *i);
 }
 
 
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index e7cc314b763..7f07efc2c35 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202";
 static char
 soundex_code(char letter)
 {
-	letter = toupper((unsigned char) letter);
+	letter = pg_ascii_toupper((unsigned char) letter);
 	/* Defend against non-ASCII letters */
 	if (letter >= 'A' && letter <= 'Z')
 		return soundex_table[letter - 'A'];
@@ -124,7 +124,7 @@ getcode(char c)
 {
 	if (isalpha((unsigned char) c))
 	{
-		c = toupper((unsigned char) c);
+		c = pg_ascii_toupper((unsigned char) c);
 		/* Defend against non-ASCII letters */
 		if (c >= 'A' && c <= 'Z')
 			return _codes[c - 'A'];
@@ -301,18 +301,18 @@ metaphone(PG_FUNCTION_ARGS)
  * accessing the array directly... */
 
 /* Look at the next letter in the word */
-#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
+#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
 /* Look at the current letter in the word */
-#define Curr_Letter (toupper((unsigned char) word[w_idx]))
+#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
 /* Go N letters back. */
 #define Look_Back_Letter(n) \
-	(w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
+	(w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
 /* Previous letter.  I dunno, should this return null on failure? */
 #define Prev_Letter (Look_Back_Letter(1))
 /* Look two letters down.  It makes sure you don't walk off the string. */
 #define After_Next_Letter \
-	(Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
-#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
+	(Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
+#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
 
 
 /* Allows us to safely look ahead an arbitrary # of letters */
@@ -742,7 +742,7 @@ _soundex(const char *instr, char *outstr)
 	}
 
 	/* Take the first letter as is */
-	*outstr++ = (char) toupper((unsigned char) *instr++);
+	*outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
 
 	count = 1;
 	while (*instr && count < SOUNDEX_LEN)
-- 
2.43.0

From 7190291ec2acfab55f90504cc3a9c13bafc87364 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 10:11:52 -0800
Subject: [PATCH v8 3/7] Add #define for UNICODE_CASEMAP_BUFSZ.

Useful for mapping a single codepoint at a time into a
statically-allocated buffer.
---
 src/include/utils/pg_locale.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 683e1a0eef8..49fd22bf8eb 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -26,6 +26,17 @@
 /* use for libc locale names */
 #define LOCALE_NAME_BUFLEN 128
 
+/*
+ * Maximum number of bytes needed to map a single codepoint. Useful for
+ * mapping and processing a single input codepoint at a time with a
+ * statically-allocated buffer.
+ *
+ * With full case mapping, an input codepoint may be mapped to as many as
+ * three output codepoints. See Unicode 5.18.2, "Change in Length".
+ */
+#define UNICODE_CASEMAP_LEN		3
+#define UNICODE_CASEMAP_BUFSZ	(UNICODE_CASEMAP_LEN * sizeof(char32_t))
+
 /* GUC settings */
 extern PGDLLIMPORT char *locale_messages;
 extern PGDLLIMPORT char *locale_monetary;
-- 
2.43.0

From 735ee6342c2365f879c47c3aa0867c58174402aa Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Fri, 7 Nov 2025 12:11:34 -0800
Subject: [PATCH v8 4/7] Allow pg_locale_t APIs to work when ctype_is_c.

Previously, the caller needed to check ctype_is_c first for some
routines and not others. Now, the APIs consistently work, and the
caller can just check ctype_is_c for optimization purposes.
---
 src/backend/utils/adt/like_support.c   | 34 ++++----------
 src/backend/utils/adt/pg_locale.c      | 63 ++++++++++++++++++++++++--
 src/backend/utils/adt/pg_locale_libc.c |  3 ++
 3 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 999f23f86d5..0debccfa67b 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen,
 static Selectivity regex_selectivity(const char *patt, int pattlen,
 									 bool case_insensitive,
 									 int fixed_prefix_len);
-static int	pattern_char_isalpha(char c, bool is_multibyte,
-								 pg_locale_t locale);
 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
 								  Oid collation);
 static Datum string_to_datum(const char *str, Oid datatype);
@@ -995,7 +993,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 	Oid			typeid = patt_const->consttype;
 	int			pos,
 				match_pos;
-	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 	pg_locale_t locale = 0;
 
 	/* the right-hand const is type text or bytea */
@@ -1055,9 +1052,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				break;
 		}
 
-		/* Stop if case-varying character (it's sort of a wildcard) */
-		if (case_insensitive &&
-			pattern_char_isalpha(patt[pos], is_multibyte, locale))
+		/*
+		 * Stop if case-varying character (it's sort of a wildcard).
+		 *
+		 * In multibyte character sets or with non-libc providers, we can't
+		 * use isalpha, and it does not seem worth trying to convert to
+		 * wchar_t or char32_t.  Instead, just pass the single byte to the
+		 * provider, which will assume any non-ASCII char is potentially
+		 * case-varying.
+		 */
+		if (case_insensitive && char_is_cased(patt[pos], locale))
 			break;
 
 		match[match_pos++] = patt[pos];
@@ -1481,24 +1485,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
 	return sel;
 }
 
-/*
- * Check whether char is a letter (and, hence, subject to case-folding)
- *
- * In multibyte character sets or with ICU, we can't use isalpha, and it does
- * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
- * Instead, just assume any non-ASCII char is potentially case-varying, and
- * hard-wire knowledge of which ASCII chars are letters.
- */
-static int
-pattern_char_isalpha(char c, bool is_multibyte,
-					 pg_locale_t locale)
-{
-	if (locale->ctype_is_c)
-		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
-	else
-		return char_is_cased(c, locale);
-}
-
 
 /*
  * For bytea, the increment function need only increment the current byte
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index b14c7837938..9319fb633b6 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1261,6 +1261,17 @@ size_t
 pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 			pg_locale_t locale)
 {
+	if (locale->ctype == NULL)
+	{
+		int			i;
+
+		srclen = (srclen >= 0) ? srclen : strlen(src);
+		for (i = 0; i < srclen && i < dstsize; i++)
+			dst[i] = pg_ascii_tolower(src[i]);
+		if (i < dstsize)
+			dst[i] = '\0';
+		return srclen;
+	}
 	return locale->ctype->strlower(dst, dstsize, src, srclen, locale);
 }
 
@@ -1268,6 +1279,29 @@ size_t
 pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 			pg_locale_t locale)
 {
+	if (locale->ctype == NULL)
+	{
+		bool		wasalnum = false;
+		int			i;
+
+		srclen = (srclen >= 0) ? srclen : strlen(src);
+		for (i = 0; i < Min(srclen, dstsize); i++)
+		{
+			char		c = src[i];
+
+			if (wasalnum)
+				dst[i] = pg_ascii_tolower(c);
+			else
+				dst[i] = pg_ascii_toupper(c);
+
+			wasalnum = ((c >= '0' && c <= '9') ||
+						(c >= 'A' && c <= 'Z') ||
+						(c >= 'a' && c <= 'z'));
+		}
+		if (i < dstsize)
+			dst[i] = '\0';
+		return srclen;
+	}
 	return locale->ctype->strtitle(dst, dstsize, src, srclen, locale);
 }
 
@@ -1275,6 +1309,17 @@ size_t
 pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 			pg_locale_t locale)
 {
+	if (locale->ctype == NULL)
+	{
+		int			i;
+
+		srclen = (srclen >= 0) ? srclen : strlen(src);
+		for (i = 0; i < srclen && i < dstsize; i++)
+			dst[i] = pg_ascii_toupper(src[i]);
+		if (i < dstsize)
+			dst[i] = '\0';
+		return srclen;
+	}
 	return locale->ctype->strupper(dst, dstsize, src, srclen, locale);
 }
 
@@ -1282,10 +1327,18 @@ size_t
 pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		   pg_locale_t locale)
 {
-	if (locale->ctype->strfold)
-		return locale->ctype->strfold(dst, dstsize, src, srclen, locale);
-	else
-		return locale->ctype->strlower(dst, dstsize, src, srclen, locale);
+	if (locale->ctype == NULL)
+	{
+		int			i;
+
+		srclen = (srclen >= 0) ? srclen : strlen(src);
+		for (i = 0; i < srclen && i < dstsize; i++)
+			dst[i] = pg_ascii_tolower(src[i]);
+		if (i < dstsize)
+			dst[i] = '\0';
+		return srclen;
+	}
+	return locale->ctype->strfold(dst, dstsize, src, srclen, locale);
 }
 
 /*
@@ -1560,6 +1613,8 @@ pg_towlower(pg_wchar wc, pg_locale_t locale)
 bool
 char_is_cased(char ch, pg_locale_t locale)
 {
+	if (locale->ctype == NULL)
+		return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
 	return locale->ctype->char_is_cased(ch, locale);
 }
 
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 716f005066a..942454de4ed 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -326,6 +326,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
 	.strlower = strlower_libc_sb,
 	.strtitle = strtitle_libc_sb,
 	.strupper = strupper_libc_sb,
+	.strfold = strlower_libc_sb,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -351,6 +352,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.strlower = strlower_libc_mb,
 	.strtitle = strtitle_libc_mb,
 	.strupper = strupper_libc_mb,
+	.strfold = strlower_libc_mb,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -372,6 +374,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.strlower = strlower_libc_mb,
 	.strtitle = strtitle_libc_mb,
 	.strupper = strupper_libc_mb,
+	.strfold = strlower_libc_mb,
 	.wc_isdigit = wc_isdigit_libc_mb,
 	.wc_isalpha = wc_isalpha_libc_mb,
 	.wc_isalnum = wc_isalnum_libc_mb,
-- 
2.43.0

From 9cc6025640a2fdb5bee4a84598a3fdb352d81954 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 10:20:36 -0800
Subject: [PATCH v8 5/7] Fix inconsistency between ltree_strncasecmp() and
 ltree_crc32_sz().

Previously, ltree_strncasecmp() used lowercasing with the default
collation; while ltree_crc32_sz used tolower() directly. These were
equivalent only if the default collation provider was libc and the
encoding is single-byte.

Change both to use casefolding with the default collation.
---
 contrib/ltree/crc32.c     | 46 ++++++++++++++++++++++++++++++++-------
 contrib/ltree/lquery_op.c | 31 ++++++++++++++++++++++++--
 2 files changed, 67 insertions(+), 10 deletions(-)

diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c
index 134f46a805e..3918d4a0ec2 100644
--- a/contrib/ltree/crc32.c
+++ b/contrib/ltree/crc32.c
@@ -10,31 +10,61 @@
 #include "postgres.h"
 #include "ltree.h"
 
+#include "crc32.h"
+#include "utils/pg_crc.h"
 #ifdef LOWER_NODE
-#include <ctype.h>
-#define TOLOWER(x)	tolower((unsigned char) (x))
-#else
-#define TOLOWER(x)	(x)
+#include "utils/pg_locale.h"
 #endif
 
-#include "crc32.h"
-#include "utils/pg_crc.h"
+#ifdef LOWER_NODE
 
 unsigned int
 ltree_crc32_sz(const char *buf, int size)
 {
 	pg_crc32	crc;
 	const char *p = buf;
+	static pg_locale_t locale = NULL;
+
+	if (!locale)
+		locale = pg_database_locale();
 
 	INIT_TRADITIONAL_CRC32(crc);
 	while (size > 0)
 	{
-		char		c = (char) TOLOWER(*p);
+		char		foldstr[UNICODE_CASEMAP_BUFSZ];
+		int			srclen = pg_mblen(p);
+		size_t		foldlen;
+
+		/* fold one codepoint at a time */
+		foldlen = pg_strfold(foldstr, UNICODE_CASEMAP_BUFSZ, p, srclen,
+							 locale);
+
+		COMP_TRADITIONAL_CRC32(crc, foldstr, foldlen);
+
+		size -= srclen;
+		p += srclen;
+	}
+	FIN_TRADITIONAL_CRC32(crc);
+	return (unsigned int) crc;
+}
+
+#else
 
-		COMP_TRADITIONAL_CRC32(crc, &c, 1);
+unsigned int
+ltree_crc32_sz(const char *buf, int size)
+{
+	pg_crc32	crc;
+	const char *p = buf;
+
+	INIT_TRADITIONAL_CRC32(crc);
+	while (size > 0)
+	{
+		COMP_TRADITIONAL_CRC32(crc, p, 1);
 		size--;
 		p++;
 	}
 	FIN_TRADITIONAL_CRC32(crc);
 	return (unsigned int) crc;
 }
+
+#endif							/* !LOWER_NODE */
diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c
index a6466f575fd..d6754eb613f 100644
--- a/contrib/ltree/lquery_op.c
+++ b/contrib/ltree/lquery_op.c
@@ -77,10 +77,37 @@ compare_subnode(ltree_level *t, char *qn, int len, int (*cmpptr) (const char *,
 int
 ltree_strncasecmp(const char *a, const char *b, size_t s)
 {
-	char	   *al = str_tolower(a, s, DEFAULT_COLLATION_OID);
-	char	   *bl = str_tolower(b, s, DEFAULT_COLLATION_OID);
+	static pg_locale_t locale = NULL;
+	size_t		al_sz = s + 1;
+	char	   *al = palloc(al_sz);
+	size_t		bl_sz = s + 1;
+	char	   *bl = palloc(bl_sz);
+	size_t		needed;
 	int			res;
 
+	if (!locale)
+		locale = pg_database_locale();
+
+	needed = pg_strfold(al, al_sz, a, s, locale);
+	if (needed + 1 > al_sz)
+	{
+		/* grow buffer if needed and retry */
+		al_sz = needed + 1;
+		al = repalloc(al, al_sz);
+		needed = pg_strfold(al, al_sz, a, s, locale);
+		Assert(needed + 1 <= al_sz);
+	}
+
+	needed = pg_strfold(bl, bl_sz, b, s, locale);
+	if (needed + 1 > bl_sz)
+	{
+		/* grow buffer if needed and retry */
+		bl_sz = needed + 1;
+		bl = repalloc(bl, bl_sz);
+		needed = pg_strfold(bl, bl_sz, b, s, locale);
+		Assert(needed + 1 <= bl_sz);
+	}
+
 	res = strncmp(al, bl, s);
 
 	pfree(al);
-- 
2.43.0

From 709c38c8a3b992e5ddf2c6d93a838d7ef588c0f9 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 20 Nov 2025 13:09:26 -0800
Subject: [PATCH v8 6/7] Inline pg_ascii_tolower() and pg_ascii_toupper().

---
 src/include/port.h      | 25 +++++++++++++++++++++++--
 src/port/pgstrcasecmp.c | 26 --------------------------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/src/include/port.h b/src/include/port.h
index 3964d3b1293..159c2bcd7e3 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -169,8 +169,29 @@ extern int	pg_strcasecmp(const char *s1, const char *s2);
 extern int	pg_strncasecmp(const char *s1, const char *s2, size_t n);
 extern unsigned char pg_toupper(unsigned char ch);
 extern unsigned char pg_tolower(unsigned char ch);
-extern unsigned char pg_ascii_toupper(unsigned char ch);
-extern unsigned char pg_ascii_tolower(unsigned char ch);
+
+/*
+ * Fold a character to upper case, following C/POSIX locale rules.
+ */
+static inline unsigned char
+pg_ascii_toupper(unsigned char ch)
+{
+	if (ch >= 'a' && ch <= 'z')
+		ch += 'A' - 'a';
+	return ch;
+}
+
+/*
+ * Fold a character to lower case, following C/POSIX locale rules.
+ */
+static inline unsigned char
+pg_ascii_tolower(unsigned char ch)
+{
+	if (ch >= 'A' && ch <= 'Z')
+		ch += 'a' - 'A';
+	return ch;
+}
+
 
 /*
  * Beginning in v12, we always replace snprintf() and friends with our own
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
index ec2b3a75c3d..17e93180381 100644
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -13,10 +13,6 @@
  *
  * NB: this code should match downcase_truncate_identifier() in scansup.c.
  *
- * We also provide strict ASCII-only case conversion functions, which can
- * be used to implement C/POSIX case folding semantics no matter what the
- * C library thinks the locale is.
- *
  *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  *
@@ -127,25 +123,3 @@ pg_tolower(unsigned char ch)
 		ch = tolower(ch);
 	return ch;
 }
-
-/*
- * Fold a character to upper case, following C/POSIX locale rules.
- */
-unsigned char
-pg_ascii_toupper(unsigned char ch)
-{
-	if (ch >= 'a' && ch <= 'z')
-		ch += 'A' - 'a';
-	return ch;
-}
-
-/*
- * Fold a character to lower case, following C/POSIX locale rules.
- */
-unsigned char
-pg_ascii_tolower(unsigned char ch)
-{
-	if (ch >= 'A' && ch <= 'Z')
-		ch += 'a' - 'A';
-	return ch;
-}
-- 
2.43.0

From 81948ecc1e3c4f1b4bd79ecd96ac151e2332f3df Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 18:16:41 -0800
Subject: [PATCH v8 7/7] Remove char_tolower() API.

It's only useful for an ILIKE optimization for the libc provider using
a single-byte encoding and a non-C locale, but it creates significant
internal complexity.
---
 src/backend/utils/adt/like.c           | 42 +++++++++-----------------
 src/backend/utils/adt/like_match.c     | 18 ++++++-----
 src/backend/utils/adt/pg_locale.c      | 22 --------------
 src/backend/utils/adt/pg_locale_libc.c | 10 ------
 src/include/utils/pg_locale.h          |  9 ------
 5 files changed, 25 insertions(+), 76 deletions(-)

diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 4216ac17f43..4a7fc583c71 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -43,8 +43,8 @@ static text *MB_do_like_escape(text *pat, text *esc);
 static int	UTF8_MatchText(const char *t, int tlen, const char *p, int plen,
 						   pg_locale_t locale);
 
-static int	SB_IMatchText(const char *t, int tlen, const char *p, int plen,
-						  pg_locale_t locale);
+static int	C_IMatchText(const char *t, int tlen, const char *p, int plen,
+						 pg_locale_t locale);
 
 static int	GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation);
 static int	Generic_Text_IC_like(text *str, text *pat, Oid collation);
@@ -84,22 +84,10 @@ wchareq(const char *p1, const char *p2)
  * of getting a single character transformed to the system's wchar_t format.
  * So now, we just downcase the strings using lower() and apply regular LIKE
  * comparison.  This should be revisited when we install better locale support.
- */
-
-/*
- * We do handle case-insensitive matching for single-byte encodings using
+ *
+ * We do handle case-insensitive matching for the C locale using
  * fold-on-the-fly processing, however.
  */
-static char
-SB_lower_char(unsigned char c, pg_locale_t locale)
-{
-	if (locale->ctype_is_c)
-		return pg_ascii_tolower(c);
-	else if (locale->is_default)
-		return pg_tolower(c);
-	else
-		return char_tolower(c, locale);
-}
 
 
 #define NextByte(p, plen)	((p)++, (plen)--)
@@ -131,9 +119,9 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
 #include "like_match.c"
 
 /* setup to compile like_match.c for single byte case insensitive matches */
-#define MATCH_LOWER(t, locale) SB_lower_char((unsigned char) (t), locale)
+#define MATCH_LOWER
 #define NextChar(p, plen) NextByte((p), (plen))
-#define MatchText SB_IMatchText
+#define MatchText C_IMatchText
 
 #include "like_match.c"
 
@@ -202,22 +190,17 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 				 errmsg("nondeterministic collations are not supported for ILIKE")));
 
 	/*
-	 * For efficiency reasons, in the single byte case we don't call lower()
-	 * on the pattern and text, but instead call SB_lower_char on each
-	 * character.  In the multi-byte case we don't have much choice :-(. Also,
-	 * ICU does not support single-character case folding, so we go the long
-	 * way.
+	 * For efficiency reasons, in the C locale we don't call lower() on the
+	 * pattern and text, but instead call SB_lower_char on each character.
 	 */
 
-	if (locale->ctype_is_c ||
-		(char_tolower_enabled(locale) &&
-		 pg_database_encoding_max_length() == 1))
+	if (locale->ctype_is_c)
 	{
 		p = VARDATA_ANY(pat);
 		plen = VARSIZE_ANY_EXHDR(pat);
 		s = VARDATA_ANY(str);
 		slen = VARSIZE_ANY_EXHDR(str);
-		return SB_IMatchText(s, slen, p, plen, locale);
+		return C_IMatchText(s, slen, p, plen, locale);
 	}
 	else
 	{
@@ -229,10 +212,13 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 													 PointerGetDatum(str)));
 		s = VARDATA_ANY(str);
 		slen = VARSIZE_ANY_EXHDR(str);
+
 		if (GetDatabaseEncoding() == PG_UTF8)
 			return UTF8_MatchText(s, slen, p, plen, 0);
-		else
+		else if (pg_database_encoding_max_length() > 1)
 			return MB_MatchText(s, slen, p, plen, 0);
+		else
+			return SB_MatchText(s, slen, p, plen, 0);
 	}
 }
 
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 892f8a745ea..54846c9541d 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -70,10 +70,14 @@
  *--------------------
  */
 
+/*
+ * MATCH_LOWER is defined for ILIKE in the C locale as an optimization. Other
+ * locales must casefold the inputs before matching.
+ */
 #ifdef MATCH_LOWER
-#define GETCHAR(t, locale) MATCH_LOWER(t, locale)
+#define GETCHAR(t) pg_ascii_tolower(t)
 #else
-#define GETCHAR(t, locale) (t)
+#define GETCHAR(t) (t)
 #endif
 
 static int
@@ -105,7 +109,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 						 errmsg("LIKE pattern must not end with escape character")));
-			if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
+			if (GETCHAR(*p) != GETCHAR(*t))
 				return LIKE_FALSE;
 		}
 		else if (*p == '%')
@@ -167,14 +171,14 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 							 errmsg("LIKE pattern must not end with escape character")));
-				firstpat = GETCHAR(p[1], locale);
+				firstpat = GETCHAR(p[1]);
 			}
 			else
-				firstpat = GETCHAR(*p, locale);
+				firstpat = GETCHAR(*p);
 
 			while (tlen > 0)
 			{
-				if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
+				if (GETCHAR(*t) == firstpat || (locale && !locale->deterministic))
 				{
 					int			matched = MatchText(t, tlen, p, plen, locale);
 
@@ -342,7 +346,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 					NextChar(t1, t1len);
 			}
 		}
-		else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
+		else if (GETCHAR(*p) != GETCHAR(*t))
 		{
 			/* non-wildcard pattern char fails to match text char */
 			return LIKE_FALSE;
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 9319fb633b6..b3afa6cad6c 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1618,28 +1618,6 @@ char_is_cased(char ch, pg_locale_t locale)
 	return locale->ctype->char_is_cased(ch, locale);
 }
 
-/*
- * char_tolower_enabled()
- *
- * Does the provider support char_tolower()?
- */
-bool
-char_tolower_enabled(pg_locale_t locale)
-{
-	return (locale->ctype->char_tolower != NULL);
-}
-
-/*
- * char_tolower()
- *
- * Convert char (single-byte encoding) to lowercase.
- */
-char
-char_tolower(unsigned char ch, pg_locale_t locale)
-{
-	return locale->ctype->char_tolower(ch, locale);
-}
-
 /*
  * Return required encoding ID for the given locale, or -1 if any encoding is
  * valid for the locale.
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 942454de4ed..3407e15712b 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -248,13 +248,6 @@ wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
 #endif
 }
 
-static char
-char_tolower_libc(unsigned char ch, pg_locale_t locale)
-{
-	Assert(pg_database_encoding_max_length() == 1);
-	return tolower_l(ch, locale->lt);
-}
-
 static bool
 char_is_cased_libc(char ch, pg_locale_t locale)
 {
@@ -338,7 +331,6 @@ static const struct ctype_methods ctype_methods_libc_sb = {
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
 	.char_is_cased = char_is_cased_libc,
-	.char_tolower = char_tolower_libc,
 	.wc_toupper = toupper_libc_sb,
 	.wc_tolower = tolower_libc_sb,
 	.max_chr = UCHAR_MAX,
@@ -364,7 +356,6 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
 	.char_is_cased = char_is_cased_libc,
-	.char_tolower = char_tolower_libc,
 	.wc_toupper = toupper_libc_sb,
 	.wc_tolower = tolower_libc_sb,
 	.max_chr = UCHAR_MAX,
@@ -386,7 +377,6 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.wc_isspace = wc_isspace_libc_mb,
 	.wc_isxdigit = wc_isxdigit_libc_mb,
 	.char_is_cased = char_is_cased_libc,
-	.char_tolower = char_tolower_libc,
 	.wc_toupper = toupper_libc_mb,
 	.wc_tolower = tolower_libc_mb,
 };
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 49fd22bf8eb..5e21b517e96 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -125,13 +125,6 @@ struct ctype_methods
 	/* required */
 	bool		(*char_is_cased) (char ch, pg_locale_t locale);
 
-	/*
-	 * Optional. If defined, will only be called for single-byte encodings. If
-	 * not defined, or if the encoding is multibyte, will fall back to
-	 * pg_strlower().
-	 */
-	char		(*char_tolower) (unsigned char ch, pg_locale_t locale);
-
 	/*
 	 * For regex and pattern matching efficiency, the maximum char value
 	 * supported by the above methods. If zero, limit is set by regex code.
@@ -188,8 +181,6 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 extern char *get_collation_actual_version(char collprovider, const char *collcollate);
 
 extern bool char_is_cased(char ch, pg_locale_t locale);
-extern bool char_tolower_enabled(pg_locale_t locale);
-extern char char_tolower(unsigned char ch, pg_locale_t locale);
 extern size_t pg_strlower(char *dst, size_t dstsize,
 						  const char *src, ssize_t srclen,
 						  pg_locale_t locale);
-- 
2.43.0

Re: Remaining dependency on setlocale()

Reply via email to