downcase_identifier(): use method table from locale provider

Jeff Davis Tue, 21 Oct 2025 11:28:29 -0700

The attached patch refactors downcase_identifier() to use a method from
the locale provider.


The main advantage is that we can bring the tolower() call into the
libc provider, and make it tolower_l() to avoid the global LC_CTYPE
dependency. It's also generally aligned with the idea that provider-
specific behavior should be defined by the provider rather than the
caller, and might enable us in the future to improve support for
Unicode-aware identifier case folding.

Unfortunately, the ICU provider also currently uses tolower() for
single-byte encodings, which seems to have been a historical oversight.
This patch doesn't correct that; it can be fixed in a separate patch.

There's a theoretical behavior change if downcase_identifier() is called
before the database default locale is initialized: it then folds with
ASCII semantics instead of the postmaster's LC_CTYPE setting. I don't
see a practical problem there. Other than that, there should be no
behavior changes.

Regards,
        Jeff Davis

From 215dc350ae468f160aaa2cb014858726f288457f Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 20 Oct 2025 16:32:18 -0700
Subject: [PATCH v1] downcase_identifier(): use method table from locale
 provider.

Refactor to allow each provider to supply its own implementation of
identifier casefolding.

When the database default locale is libc, use tolower_l() instead of
tolower() to remove its dependency on the global LC_CTYPE setting.

When the database default locale is ICU, and the encoding is
single-byte, tolower() is still required to preserve historical
behavior. We should consider fixing this separately.

One minor behavior change is that, before the database default locale
is initialized, downcase_identifier() uses ASCII semantics to fold
identifiers. Previously, it would use the postmaster's LC_CTYPE
setting from the environment. While that could have some effect during
GUC processing, for example, it would have been fragile to rely on the
environment setting anyway. (Also, it only matters when the encoding
is single-byte.)
---
 src/backend/parser/scansup.c              | 39 +++++++-----------
 src/backend/utils/adt/pg_locale.c         | 37 +++++++++++++++++
 src/backend/utils/adt/pg_locale_builtin.c | 24 ++++++++++++
 src/backend/utils/adt/pg_locale_icu.c     | 32 +++++++++++++++
 src/backend/utils/adt/pg_locale_libc.c    | 48 +++++++++++++++++++++++
 src/include/utils/pg_locale.h             |  6 +++
 6 files changed, 162 insertions(+), 24 deletions(-)

diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 2feb2b6cf5a..0bd049643d1 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -18,6 +18,7 @@
 
 #include "mb/pg_wchar.h"
 #include "parser/scansup.h"
+#include "utils/pg_locale.h"
 
 
 /*
@@ -46,35 +47,25 @@ char *
 downcase_identifier(const char *ident, int len, bool warn, bool truncate)
 {
 	char	   *result;
-	int			i;
-	bool		enc_is_single_byte;
-
-	result = palloc(len + 1);
-	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	size_t		dstsize;
+	size_t		needed pg_attribute_unused();	/* read only by the Asserts */
 
 	/*
-	 * SQL99 specifies Unicode-aware case normalization, which we don't yet
-	 * have the infrastructure for.  Instead we use tolower() to provide a
-	 * locale-aware translation.  However, there are some locales where this
-	 * is not right either (eg, Turkish may do strange things with 'i' and
-	 * 'I').  Our current compromise is to use tolower() for characters with
-	 * the high bit set, as long as they aren't part of a multi-byte
-	 * character, and use an ASCII-only downcasing for 7-bit characters.
+	 * pg_strfold_ident() folds byte for byte, so the result is exactly len
+	 * bytes plus the terminator (checked by the Asserts below).  NB: if we
+	 * decide to support Unicode-aware identifier case folding, then we need
+	 * to account for a change in string length.
 	 */
-	for (i = 0; i < len; i++)
-	{
-		unsigned char ch = (unsigned char) ident[i];
+	dstsize = len + 1;
+	result = palloc(dstsize);
 
-		if (ch >= 'A' && ch <= 'Z')
-			ch += 'a' - 'A';
-		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
-			ch = tolower(ch);
-		result[i] = (char) ch;
-	}
-	result[i] = '\0';
+	needed = pg_strfold_ident(result, dstsize, ident, len);
+	Assert(needed + 1 == dstsize);
+	Assert(needed == len);
+	Assert(result[len] == '\0');
 
-	if (i >= NAMEDATALEN && truncate)
-		truncate_identifier(result, i, warn);
+	if (len >= NAMEDATALEN && truncate)
+		truncate_identifier(result, len, warn);
 
 	return result;
 }
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 67299c55ed8..f45c6a76064 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1275,6 +1275,43 @@ pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		return locale->ctype->strlower(dst, dstsize, src, srclen, locale);
 }
 
+/*
+ * Fold an identifier using the database default locale.  At most destsize
+ * bytes are written to dest; the return value is srclen.
+ *
+ * 1. A..Z always fold to a..z, even in the Turkish locale.
+ * 2. Single-byte encodings fold characters beyond 127 using libc tolower(),
+ *    even if the locale provider is ICU.
+ * 3. Multi-byte encodings only fold A..Z.
+ *
+ * XXX: historical behavior rather than ordinary locale behavior; use only for
+ * identifiers. Can we make it equivalent to pg_strfold(..., default_locale)?
+ */
+size_t
+pg_strfold_ident(char *dest, size_t destsize, const char *src, ssize_t srclen)
+{
+	if (default_locale == NULL || default_locale->ctype == NULL)
+	{							/* no locale or no ctype methods: ASCII only */
+		int			i;
+
+		for (i = 0; i < srclen && i < destsize; i++)
+		{
+			unsigned char ch = (unsigned char) src[i];
+
+			if (ch >= 'A' && ch <= 'Z')
+				ch += 'a' - 'A';	/* shift ASCII upper case to lower case */
+			dest[i] = (char) ch;
+		}
+
+		if (i < destsize)
+			dest[i] = '\0';		/* terminate only if room remains */
+
+		return srclen;			/* input length, even if dest was too small */
+	}
+	return default_locale->ctype->strfold_ident(dest, destsize, src, srclen,
+												default_locale);
+}
+
 /*
  * pg_strcoll
  *
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..c51594c4178 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -109,6 +109,29 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 						   locale->builtin.casemap_full);
 }
 
+static size_t
+strfold_ident_builtin(char *dst, size_t dstsize, const char *src,
+					  ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+
+	Assert(GetDatabaseEncoding() == PG_UTF8);	/* caller must be in UTF-8 */
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';	/* only ASCII A..Z is folded */
+		dst[i] = (char) ch;		/* all other bytes are copied unchanged */
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';			/* terminate only if room remains */
+
+	return srclen;				/* input length, even if dst was too small */
+}
+
 static bool
 wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
 {
@@ -203,6 +226,7 @@ static const struct ctype_methods ctype_methods_builtin = {
 	.wc_ispunct = wc_ispunct_builtin,
 	.wc_isspace = wc_isspace_builtin,
 	.wc_isxdigit = wc_isxdigit_builtin,
+	.strfold_ident = strfold_ident_builtin,
 	.char_is_cased = char_is_cased_builtin,
 	.wc_tolower = wc_tolower_builtin,
 	.wc_toupper = wc_toupper_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 05bad202669..42b15cf0955 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -218,6 +218,37 @@ wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
 	return u_isxdigit(wc);
 }
 
+/*
+ * Historically, tolower() was used for identifier casefolding even when the
+ * provider was ICU. We preserve that behavior here, but it creates an awkward
+ * dependency on the global LC_CTYPE. The pg_locale_t object has no internal
+ * libc locale_t object, so we can't use tolower_l().
+ */
+static size_t
+strfold_ident_icu(char *dst, size_t dstsize, const char *src,
+				  ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+	bool		enc_is_single_byte; /* database encoding max length is 1 */
+
+	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';	/* ASCII A..Z never goes through tolower() */
+		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
+			ch = tolower(ch);	/* high-bit byte: uses the global LC_CTYPE */
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';			/* terminate only if room remains */
+
+	return srclen;				/* input length, even if dst was too small */
+}
+
 static const struct ctype_methods ctype_methods_icu = {
 	.strlower = strlower_icu,
 	.strtitle = strtitle_icu,
@@ -233,6 +264,7 @@ static const struct ctype_methods ctype_methods_icu = {
 	.wc_ispunct = wc_ispunct_icu,
 	.wc_isspace = wc_isspace_icu,
 	.wc_isxdigit = wc_isxdigit_icu,
+	.strfold_ident = strfold_ident_icu,
 	.char_is_cased = char_is_cased_icu,
 	.wc_toupper = toupper_icu,
 	.wc_tolower = tolower_icu,
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 19a50662398..24a5e46d718 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -323,6 +323,51 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
 		return wc;
 }
 
+static size_t
+strfold_ident_libc_sb(char *dst, size_t dstsize, const char *src,
+					  ssize_t srclen, pg_locale_t locale)
+{
+	locale_t	loc = locale->lt;	/* passed to isupper_l()/tolower_l() */
+	int			i;
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';	/* ASCII A..Z never goes through tolower_l() */
+		else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
+			ch = tolower_l(ch, loc);	/* high-bit byte: uses loc, not LC_CTYPE */
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';			/* terminate only if room remains */
+
+	return srclen;				/* input length, even if dst was too small */
+}
+
+static size_t
+strfold_ident_libc_mb(char *dst, size_t dstsize, const char *src,
+					  ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';	/* only ASCII A..Z is folded */
+		dst[i] = (char) ch;		/* all other bytes are copied unchanged */
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';			/* terminate only if room remains */
+
+	return srclen;				/* input length, even if dst was too small */
+}
+
 static const struct ctype_methods ctype_methods_libc_sb = {
 	.strlower = strlower_libc_sb,
 	.strtitle = strtitle_libc_sb,
@@ -337,6 +382,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
 	.wc_ispunct = wc_ispunct_libc_sb,
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
+	.strfold_ident = strfold_ident_libc_sb,
 	.char_is_cased = char_is_cased_libc,
 	.char_tolower = char_tolower_libc,
 	.wc_toupper = toupper_libc_sb,
@@ -362,6 +408,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.wc_ispunct = wc_ispunct_libc_sb,
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
+	.strfold_ident = strfold_ident_libc_mb,
 	.char_is_cased = char_is_cased_libc,
 	.char_tolower = char_tolower_libc,
 	.wc_toupper = toupper_libc_sb,
@@ -383,6 +430,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.wc_ispunct = wc_ispunct_libc_mb,
 	.wc_isspace = wc_isspace_libc_mb,
 	.wc_isxdigit = wc_isxdigit_libc_mb,
+	.strfold_ident = strfold_ident_libc_mb,
 	.char_is_cased = char_is_cased_libc,
 	.char_tolower = char_tolower_libc,
 	.wc_toupper = toupper_libc_mb,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index e08cb8228fa..97ffaf745fb 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -113,6 +113,10 @@ struct ctype_methods
 
 	/* required */
 	bool		(*char_is_cased) (char ch, pg_locale_t locale);
+	size_t		(*strfold_ident) (char *dest, size_t destsize,
+								  const char *src, ssize_t srclen,
+								  pg_locale_t locale);
+
 
 	/*
 	 * Optional. If defined, will only be called for single-byte encodings. If
@@ -191,6 +195,8 @@ extern size_t pg_strupper(char *dst, size_t dstsize,
 extern size_t pg_strfold(char *dst, size_t dstsize,
 						 const char *src, ssize_t srclen,
 						 pg_locale_t locale);
+extern size_t pg_strfold_ident(char *dst, size_t dstsize,
+							   const char *src, ssize_t srclen);
 extern int	pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
 extern int	pg_strncoll(const char *arg1, ssize_t len1,
 						const char *arg2, ssize_t len2, pg_locale_t locale);
-- 
2.43.0

downcase_identifier(): use method table from locale provider

Reply via email to