pg_strcasecmp(), etc., have a dependency on LC_CTYPE, which means a
dependency on setlocale(). I'd like to eliminate those dependencies in
the backend because they cause significant annoyance, especially when
using non-libc providers.
Right now, these functions are effectively very close to plain-ASCII
semantics. If the character is in the ASCII range, then they only fold
characters A..Z. If using a multibyte encoding, any other byte is part
of a multibyte sequence, so the behavior of tolower() is undefined, and
I believe usually returns 0.
So the only time tolower() matters is when using a single-byte encoding
and folding a character outside the ASCII range.
Most of the callers seem to use these functions in a context that only
cares about ASCII, anyway.
There are a few callers where it matters, such as the implementations
of UPPER()/LOWER()/INITCAP() and LIKE. Those already need special
cases, so it's easy to inline them and make use of the pg_locale_t
object, thus avoiding the dependency on the global LC_CTYPE.
There's a comment at the top of pgstrcasecmp.c saying:
NB: this code should match downcase_truncate_identifier() in
scansup.c.
but I don't see call sites where that's likely to matter. I'd like to
do something about downcase_identifier() as well, but that has more
serious compatibility issues if someone is affected, so needs a bit
more care. Also, given that downcase_identifier checks for a single
byte encoding and these other functions do not, I don't think there's
any guarantee that they are identical in behavior.
While I can imagine that the tolower() call may have been useful at one
time, the fact that it doesn't work for UTF-8 makes me think it's not
widely relied upon.
Am I missing something? Perhaps it matters for code outside the
backend?
Attached is a patch to remove the tolower() calls from pgstrcasecmp.c,
and fix up the few call sites where it's needed.
Regards,
Jeff Davis
From 2d311174a4d822962f09a7feb5c90d275b212345 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 20 Oct 2025 11:40:36 -0700
Subject: [PATCH v1] Remove tolower() call from pgstrcasecmp.c functions.
Eliminate these functions' dependency on the global LC_CTYPE. This
never worked for multi-byte encodings, and most callers only care
about ASCII.
For callers where it does matter, inline the pg_tolower()/pg_toupper()
calls, and use the available pg_locale_t object to avoid dependency on
the global LC_CTYPE.
---
src/backend/utils/adt/like.c | 9 ++++++-
src/backend/utils/adt/pg_locale_libc.c | 32 +++++++++++++++++++----
src/port/pgstrcasecmp.c | 35 ++++++--------------------
3 files changed, 42 insertions(+), 34 deletions(-)
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 4216ac17f43..bb53ee8d09f 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -96,7 +96,14 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
if (locale->ctype_is_c)
return pg_ascii_tolower(c);
else if (locale->is_default)
- return pg_tolower(c);
+ {
+ if (c >= 'A' && c <= 'Z')
+ return c + ('a' - 'A');
+ else if (IS_HIGHBIT_SET(c))
+ return char_tolower(c, locale);
+ else
+ return c;
+ }
else
return char_tolower(c, locale);
}
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 34865ccf00e..e60479ef2eb 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -250,7 +250,9 @@ static char
char_tolower_libc(unsigned char ch, pg_locale_t locale)
{
Assert(pg_database_encoding_max_length() == 1);
- return tolower_l(ch, locale->lt);
+ if (isupper_l(ch, locale->lt))
+ return tolower_l(ch, locale->lt);
+ return ch;
}
static bool
@@ -448,7 +450,12 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
for (p = dest; *p; p++)
{
if (locale->is_default)
- *p = pg_tolower((unsigned char) *p);
+ {
+ if (*p >= 'A' && *p <= 'Z')
+ *p += 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+ *p = tolower_l((unsigned char) *p, loc);
+ }
else
*p = tolower_l((unsigned char) *p, loc);
}
@@ -533,9 +540,19 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
if (locale->is_default)
{
if (wasalnum)
- *p = pg_tolower((unsigned char) *p);
+ {
+ if (*p >= 'A' && *p <= 'Z')
+ *p += 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+ *p = tolower_l((unsigned char) *p, loc);
+ }
else
- *p = pg_toupper((unsigned char) *p);
+ {
+ if (*p >= 'a' && *p <= 'z')
+ *p += 'A' - 'a';
+ else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+ *p = toupper_l((unsigned char) *p, loc);
+ }
}
else
{
@@ -631,7 +648,12 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
for (p = dest; *p; p++)
{
if (locale->is_default)
- *p = pg_toupper((unsigned char) *p);
+ {
+ if (*p >= 'a' && *p <= 'z')
+ *p += 'A' - 'a';
+ else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+ *p = toupper_l((unsigned char) *p, loc);
+ }
else
*p = toupper_l((unsigned char) *p, loc);
}
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
index ec2b3a75c3d..9aab1f41e61 100644
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -1,23 +1,16 @@
/*-------------------------------------------------------------------------
*
* pgstrcasecmp.c
- * Portable SQL-like case-independent comparisons and conversions.
+ * Portable case-independent comparisons and conversions.
*
- * SQL99 specifies Unicode-aware case normalization, which we don't yet
- * have the infrastructure for. Instead we use tolower() to provide a
- * locale-aware translation. However, there are some locales where this
- * is not right either (eg, Turkish may do strange things with 'i' and
- * 'I'). Our current compromise is to use tolower() for characters with
- * the high bit set, and use an ASCII-only downcasing for 7-bit
- * characters.
+ * SQL99 specifies Unicode-aware case normalization, but for historical
+ * reasons this was never fully supported. These functions use ASCII-only
+ * case conversion.
*
- * NB: this code should match downcase_truncate_identifier() in scansup.c.
- *
- * We also provide strict ASCII-only case conversion functions, which can
+ * We also provide explicit ASCII-only case conversion functions, which can
* be used to implement C/POSIX case folding semantics no matter what the
* C library thinks the locale is.
*
- *
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
*
* src/port/pgstrcasecmp.c
@@ -44,13 +37,9 @@ pg_strcasecmp(const char *s1, const char *s2)
{
if (ch1 >= 'A' && ch1 <= 'Z')
ch1 += 'a' - 'A';
- else if (IS_HIGHBIT_SET(ch1) && isupper(ch1))
- ch1 = tolower(ch1);
if (ch2 >= 'A' && ch2 <= 'Z')
ch2 += 'a' - 'A';
- else if (IS_HIGHBIT_SET(ch2) && isupper(ch2))
- ch2 = tolower(ch2);
if (ch1 != ch2)
return (int) ch1 - (int) ch2;
@@ -77,13 +66,9 @@ pg_strncasecmp(const char *s1, const char *s2, size_t n)
{
if (ch1 >= 'A' && ch1 <= 'Z')
ch1 += 'a' - 'A';
- else if (IS_HIGHBIT_SET(ch1) && isupper(ch1))
- ch1 = tolower(ch1);
if (ch2 >= 'A' && ch2 <= 'Z')
ch2 += 'a' - 'A';
- else if (IS_HIGHBIT_SET(ch2) && isupper(ch2))
- ch2 = tolower(ch2);
if (ch1 != ch2)
return (int) ch1 - (int) ch2;
@@ -98,16 +83,13 @@ pg_strncasecmp(const char *s1, const char *s2, size_t n)
* Fold a character to upper case.
*
* Unlike some versions of toupper(), this is safe to apply to characters
- * that aren't lower case letters. Note however that the whole thing is
- * a bit bogus for multibyte character sets.
+ * that aren't lower case letters.
*/
unsigned char
pg_toupper(unsigned char ch)
{
if (ch >= 'a' && ch <= 'z')
ch += 'A' - 'a';
- else if (IS_HIGHBIT_SET(ch) && islower(ch))
- ch = toupper(ch);
return ch;
}
@@ -115,16 +97,13 @@ pg_toupper(unsigned char ch)
* Fold a character to lower case.
*
* Unlike some versions of tolower(), this is safe to apply to characters
- * that aren't upper case letters. Note however that the whole thing is
- * a bit bogus for multibyte character sets.
+ * that aren't upper case letters.
*/
unsigned char
pg_tolower(unsigned char ch)
{
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
- else if (IS_HIGHBIT_SET(ch) && isupper(ch))
- ch = tolower(ch);
return ch;
}
--
2.43.0