Can we make pg_strcasecmp(), pg_tolower(), pg_toupper() plain ASCII semantics?

Jeff Davis Mon, 20 Oct 2025 14:03:10 -0700

pg_strcasecmp(), etc., have a dependency on LC_CTYPE, which means a
dependency on setlocale(). I'd like to eliminate those dependencies in
the backend because they cause significant annoyance, especially when
using non-libc providers.


Right now, these functions are effectively very close to plain-ASCII
semantics. If the character is in the ASCII range, then they only fold
the characters A..Z. If using a multibyte encoding, any other byte is
part of a multibyte sequence, so the behavior of tolower() is undefined,
and I believe it usually returns 0.

So the only time tolower() matters is when using a single-byte encoding
and folding a character outside the ASCII range.

Most of the callers seem to use these functions in a context that only
cares about ASCII, anyway.

There are a few callers where it matters, such as the implementations
of UPPER()/LOWER()/INITCAP() and LIKE. Those already need special
cases, so it's easy to inline them and make use of the pg_locale_t
object, thus avoiding the dependency on the global LC_CTYPE.

There's a comment at the top of the file saying:

  NB: this code should match downcase_truncate_identifier() in
scansup.c.

but I don't see call sites where that's likely to matter. I'd like to
do something about downcase_identifier() as well, but that has more
serious compatibility issues if someone is affected, so it needs a bit
more care. Also, given that downcase_identifier() checks for a
single-byte encoding and these other functions do not, I don't think
there's any guarantee that they are identical in behavior.

While I can imagine that the tolower() call may have been useful at one
time, the fact that it doesn't work for UTF-8 makes me think it's not
widely relied upon.

Am I missing something? Perhaps it matters for code outside the
backend? 

Attached is a patch to remove the tolower() calls from pgstrcasecmp.c,
and fix up the few call sites where it's needed.

Regards,
        Jeff Davis

From 2d311174a4d822962f09a7feb5c90d275b212345 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 20 Oct 2025 11:40:36 -0700
Subject: [PATCH v1] Remove tolower() call from pgstrcasecmp.c functions.

Eliminate these functions' dependency on the global LC_CTYPE. This
never worked for multi-byte encodings, and most callers only care
about ASCII.

For callers where it does matter, inline the pg_tolower()/pg_toupper()
calls, and use the available pg_locale_t object to avoid dependency on
the global LC_CTYPE.
---
 src/backend/utils/adt/like.c           |  9 ++++++-
 src/backend/utils/adt/pg_locale_libc.c | 32 +++++++++++++++++++----
 src/port/pgstrcasecmp.c                | 35 ++++++--------------------
 3 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 4216ac17f43..bb53ee8d09f 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -96,7 +96,14 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
 	if (locale->ctype_is_c)
 		return pg_ascii_tolower(c);
 	else if (locale->is_default)
-		return pg_tolower(c);
+	{
+		if (c >= 'A' && c <= 'Z')
+			return c + ('a' - 'A');
+		else if (IS_HIGHBIT_SET(c))
+			return char_tolower(c, locale);
+		else
+			return c;
+	}
 	else
 		return char_tolower(c, locale);
 }
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 34865ccf00e..e60479ef2eb 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -250,7 +250,9 @@ static char
 char_tolower_libc(unsigned char ch, pg_locale_t locale)
 {
 	Assert(pg_database_encoding_max_length() == 1);
-	return tolower_l(ch, locale->lt);
+	if (isupper_l(ch, locale->lt))
+		return tolower_l(ch, locale->lt);
+	return ch;
 }
 
 static bool
@@ -448,7 +450,12 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		for (p = dest; *p; p++)
 		{
 			if (locale->is_default)
-				*p = pg_tolower((unsigned char) *p);
+			{
+				if (*p >= 'A' && *p <= 'Z')
+					*p += 'a' - 'A';
+				else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+					*p = tolower_l((unsigned char) *p, loc);
+			}
 			else
 				*p = tolower_l((unsigned char) *p, loc);
 		}
@@ -533,9 +540,19 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			if (locale->is_default)
 			{
 				if (wasalnum)
-					*p = pg_tolower((unsigned char) *p);
+				{
+					if (*p >= 'A' && *p <= 'Z')
+						*p += 'a' - 'A';
+					else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+						*p = tolower_l((unsigned char) *p, loc);
+				}
 				else
-					*p = pg_toupper((unsigned char) *p);
+				{
+					if (*p >= 'a' && *p <= 'z')
+						*p += 'A' - 'a';
+					else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+						*p = toupper_l((unsigned char) *p, loc);
+				}
 			}
 			else
 			{
@@ -631,7 +648,12 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		for (p = dest; *p; p++)
 		{
 			if (locale->is_default)
-				*p = pg_toupper((unsigned char) *p);
+			{
+				if (*p >= 'a' && *p <= 'z')
+					*p += 'A' - 'a';
+				else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+					*p = toupper_l((unsigned char) *p, loc);
+			}
 			else
 				*p = toupper_l((unsigned char) *p, loc);
 		}
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
index ec2b3a75c3d..9aab1f41e61 100644
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -1,23 +1,16 @@
 /*-------------------------------------------------------------------------
  *
  * pgstrcasecmp.c
- *	   Portable SQL-like case-independent comparisons and conversions.
+ *	   Portable case-independent comparisons and conversions.
  *
- * SQL99 specifies Unicode-aware case normalization, which we don't yet
- * have the infrastructure for.  Instead we use tolower() to provide a
- * locale-aware translation.  However, there are some locales where this
- * is not right either (eg, Turkish may do strange things with 'i' and
- * 'I').  Our current compromise is to use tolower() for characters with
- * the high bit set, and use an ASCII-only downcasing for 7-bit
- * characters.
+ * SQL99 specifies Unicode-aware case normalization, but for historical
+ * reasons this was never fully supported. These functions use
+ * ASCII-only case conversion.
  *
- * NB: this code should match downcase_truncate_identifier() in scansup.c.
- *
- * We also provide strict ASCII-only case conversion functions, which can
+ * We also provide explicit ASCII-only case conversion functions, which can
  * be used to implement C/POSIX case folding semantics no matter what the
  * C library thinks the locale is.
  *
- *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  *
  * src/port/pgstrcasecmp.c
@@ -44,13 +37,9 @@ pg_strcasecmp(const char *s1, const char *s2)
 		{
 			if (ch1 >= 'A' && ch1 <= 'Z')
 				ch1 += 'a' - 'A';
-			else if (IS_HIGHBIT_SET(ch1) && isupper(ch1))
-				ch1 = tolower(ch1);
 
 			if (ch2 >= 'A' && ch2 <= 'Z')
 				ch2 += 'a' - 'A';
-			else if (IS_HIGHBIT_SET(ch2) && isupper(ch2))
-				ch2 = tolower(ch2);
 
 			if (ch1 != ch2)
 				return (int) ch1 - (int) ch2;
@@ -77,13 +66,9 @@ pg_strncasecmp(const char *s1, const char *s2, size_t n)
 		{
 			if (ch1 >= 'A' && ch1 <= 'Z')
 				ch1 += 'a' - 'A';
-			else if (IS_HIGHBIT_SET(ch1) && isupper(ch1))
-				ch1 = tolower(ch1);
 
 			if (ch2 >= 'A' && ch2 <= 'Z')
 				ch2 += 'a' - 'A';
-			else if (IS_HIGHBIT_SET(ch2) && isupper(ch2))
-				ch2 = tolower(ch2);
 
 			if (ch1 != ch2)
 				return (int) ch1 - (int) ch2;
@@ -98,16 +83,13 @@ pg_strncasecmp(const char *s1, const char *s2, size_t n)
  * Fold a character to upper case.
  *
  * Unlike some versions of toupper(), this is safe to apply to characters
- * that aren't lower case letters.  Note however that the whole thing is
- * a bit bogus for multibyte character sets.
+ * that aren't lower case letters.
  */
 unsigned char
 pg_toupper(unsigned char ch)
 {
 	if (ch >= 'a' && ch <= 'z')
 		ch += 'A' - 'a';
-	else if (IS_HIGHBIT_SET(ch) && islower(ch))
-		ch = toupper(ch);
 	return ch;
 }
 
@@ -115,16 +97,13 @@ pg_toupper(unsigned char ch)
  * Fold a character to lower case.
  *
  * Unlike some versions of tolower(), this is safe to apply to characters
- * that aren't upper case letters.  Note however that the whole thing is
- * a bit bogus for multibyte character sets.
+ * that aren't upper case letters.
  */
 unsigned char
 pg_tolower(unsigned char ch)
 {
 	if (ch >= 'A' && ch <= 'Z')
 		ch += 'a' - 'A';
-	else if (IS_HIGHBIT_SET(ch) && isupper(ch))
-		ch = tolower(ch);
 	return ch;
 }
 
-- 
2.43.0

Can we make pg_strcasecmp(), pg_tolower(), pg_toupper() plain ASCII semantics?

Reply via email to