On Wed, 2025-11-12 at 19:59 +0100, Peter Eisentraut wrote:
> Many of these issues are pre-existing, but I just figured it has
> reached
> a point where we need to do something about it.
I tried to simplify things in this patch series, assuming that we have
some tolerance for small behavior changes.
0001: No behavior change here, same patch as before. Uncontroversial
simplification, so I plan to commit this soon.
0002: change fuzzystrmatch to use ASCII semantics. As far as I can
tell, this only affects the results of soundex(). Before the patch, in
en_US.iso885915, soundex('réd') was 'RÉ30', after the patch it's
'Ré30'. I'm not sure whether the current behavior is intentional or
not. Other functions (daitch_mokotoff, levenshtein, and metaphone) are
unaffected as far as I can tell.
0003+0005: change ltree to use case folding instead of tolower(). I
believe this is a bug fix, because the current code is inconsistent
between ltree_strncasecmp() and ltree_crc32_sz().
0006-0007: Remove char_tolower() API. This also removes the
optimization for single-byte encodings with the libc provider and a
non-C locale, but simplifies the code (the optimization is retained for
the C locale). It's possible to make the lazy-folding optimization work
for all locales without the char_tolower() API by doing something
similar to what 0005 does for ltree. But to make this work efficiently
for Generic_Text_IC_like() would be a bit more complex: we'd need to
adjust MatchText() to be able to fold the arguments lazily, and perhaps
introduce some kind of casemapping iterator. That's already a pretty
complex function, so I'm hesitant to do that work unless the
optimization is important.
These patches don't get us quite to the point of eliminating the
LC_CTYPE dependency (there's still downcase_identifier() and
pg_strcasecmp() to worry about, and some assorted isxyz() calls to
examine), but they simplify things enough that the path forward will be
easier.
Regards,
Jeff Davis
From 82ef752da7f25d0d718f98ef74748a3b3555d1df Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Sun, 26 Oct 2025 14:58:02 -0700
Subject: [PATCH v8 1/7] Avoid global LC_CTYPE dependency in pg_locale_libc.c.
Call tolower_l() directly instead of through pg_tolower(), because the
latter depends on the global LC_CTYPE.
---
src/backend/utils/adt/pg_locale_libc.c | 28 ++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 9c7fcd1fc7a..716f005066a 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -450,7 +450,12 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
for (p = dest; *p; p++)
{
if (locale->is_default)
- *p = pg_tolower((unsigned char) *p);
+ {
+ if (*p >= 'A' && *p <= 'Z')
+ *p += 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+ *p = tolower_l((unsigned char) *p, loc);
+ }
else
*p = tolower_l((unsigned char) *p, loc);
}
@@ -535,9 +540,19 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
if (locale->is_default)
{
if (wasalnum)
- *p = pg_tolower((unsigned char) *p);
+ {
+ if (*p >= 'A' && *p <= 'Z')
+ *p += 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
+ *p = tolower_l((unsigned char) *p, loc);
+ }
else
- *p = pg_toupper((unsigned char) *p);
+ {
+ if (*p >= 'a' && *p <= 'z')
+ *p -= 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+ *p = toupper_l((unsigned char) *p, loc);
+ }
}
else
{
@@ -633,7 +648,12 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
for (p = dest; *p; p++)
{
if (locale->is_default)
- *p = pg_toupper((unsigned char) *p);
+ {
+ if (*p >= 'a' && *p <= 'z')
+ *p -= 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
+ *p = toupper_l((unsigned char) *p, loc);
+ }
else
*p = toupper_l((unsigned char) *p, loc);
}
--
2.43.0
From 09b3be1438da3561562042b86985439f7a206bf1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 13:24:38 -0800
Subject: [PATCH v8 2/7] fuzzystrmatch: use pg_ascii_toupper().
fuzzystrmatch is designed for ASCII, so no need to rely on the global
LC_CTYPE setting.
---
contrib/fuzzystrmatch/dmetaphone.c | 2 +-
contrib/fuzzystrmatch/fuzzystrmatch.c | 16 ++++++++--------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 6627b2b8943..bb5d3e90756 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -284,7 +284,7 @@ MakeUpper(metastring *s)
char *i;
for (i = s->str; *i; i++)
- *i = toupper((unsigned char) *i);
+ *i = pg_ascii_toupper((unsigned char) *i);
}
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index e7cc314b763..7f07efc2c35 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202";
static char
soundex_code(char letter)
{
- letter = toupper((unsigned char) letter);
+ letter = pg_ascii_toupper((unsigned char) letter);
/* Defend against non-ASCII letters */
if (letter >= 'A' && letter <= 'Z')
return soundex_table[letter - 'A'];
@@ -124,7 +124,7 @@ getcode(char c)
{
if (isalpha((unsigned char) c))
{
- c = toupper((unsigned char) c);
+ c = pg_ascii_toupper((unsigned char) c);
/* Defend against non-ASCII letters */
if (c >= 'A' && c <= 'Z')
return _codes[c - 'A'];
@@ -301,18 +301,18 @@ metaphone(PG_FUNCTION_ARGS)
* accessing the array directly... */
/* Look at the next letter in the word */
-#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
+#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
/* Look at the current letter in the word */
-#define Curr_Letter (toupper((unsigned char) word[w_idx]))
+#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
/* Go N letters back. */
#define Look_Back_Letter(n) \
- (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
+ (w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
/* Previous letter. I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down. It makes sure you don't walk off the string. */
#define After_Next_Letter \
- (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
-#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
+ (Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
+#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
/* Allows us to safely look ahead an arbitrary # of letters */
@@ -742,7 +742,7 @@ _soundex(const char *instr, char *outstr)
}
/* Take the first letter as is */
- *outstr++ = (char) toupper((unsigned char) *instr++);
+ *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
count = 1;
while (*instr && count < SOUNDEX_LEN)
--
2.43.0
From 7190291ec2acfab55f90504cc3a9c13bafc87364 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 10:11:52 -0800
Subject: [PATCH v8 3/7] Add #define for UNICODE_CASEMAP_BUFSZ.
Useful for mapping a single codepoint at a time into a
statically-allocated buffer.
---
src/include/utils/pg_locale.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 683e1a0eef8..49fd22bf8eb 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -26,6 +26,17 @@
/* use for libc locale names */
#define LOCALE_NAME_BUFLEN 128
+/*
+ * Maximum number of bytes needed to map a single codepoint. Useful for
+ * mapping and processing a single input codepoint at a time with a
+ * statically-allocated buffer.
+ *
+ * With full case mapping, an input codepoint may be mapped to as many as
+ * three output codepoints. See Unicode 5.18.2, "Change in Length".
+ */
+#define UNICODE_CASEMAP_LEN 3
+#define UNICODE_CASEMAP_BUFSZ (UNICODE_CASEMAP_LEN * sizeof(char32_t))
+
/* GUC settings */
extern PGDLLIMPORT char *locale_messages;
extern PGDLLIMPORT char *locale_monetary;
--
2.43.0
From 735ee6342c2365f879c47c3aa0867c58174402aa Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Fri, 7 Nov 2025 12:11:34 -0800
Subject: [PATCH v8 4/7] Allow pg_locale_t APIs to work when ctype_is_c.
Previously, the caller needed to check ctype_is_c first for some
routines and not others. Now, the APIs consistently work, and the
caller can just check ctype_is_c for optimization purposes.
---
src/backend/utils/adt/like_support.c | 34 ++++----------
src/backend/utils/adt/pg_locale.c | 63 ++++++++++++++++++++++++--
src/backend/utils/adt/pg_locale_libc.c | 3 ++
3 files changed, 72 insertions(+), 28 deletions(-)
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 999f23f86d5..0debccfa67b 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen,
static Selectivity regex_selectivity(const char *patt, int pattlen,
bool case_insensitive,
int fixed_prefix_len);
-static int pattern_char_isalpha(char c, bool is_multibyte,
- pg_locale_t locale);
static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
Oid collation);
static Datum string_to_datum(const char *str, Oid datatype);
@@ -995,7 +993,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
Oid typeid = patt_const->consttype;
int pos,
match_pos;
- bool is_multibyte = (pg_database_encoding_max_length() > 1);
pg_locale_t locale = 0;
/* the right-hand const is type text or bytea */
@@ -1055,9 +1052,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
break;
}
- /* Stop if case-varying character (it's sort of a wildcard) */
- if (case_insensitive &&
- pattern_char_isalpha(patt[pos], is_multibyte, locale))
+ /*
+ * Stop if case-varying character (it's sort of a wildcard).
+ *
+ * In multibyte character sets or with non-libc providers, we can't
+ * use isalpha, and it does not seem worth trying to convert to
+ * wchar_t or char32_t. Instead, just pass the single byte to the
+ * provider, which will assume any non-ASCII char is potentially
+ * case-varying.
+ */
+ if (case_insensitive && char_is_cased(patt[pos], locale))
break;
match[match_pos++] = patt[pos];
@@ -1481,24 +1485,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
return sel;
}
-/*
- * Check whether char is a letter (and, hence, subject to case-folding)
- *
- * In multibyte character sets or with ICU, we can't use isalpha, and it does
- * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
- * Instead, just assume any non-ASCII char is potentially case-varying, and
- * hard-wire knowledge of which ASCII chars are letters.
- */
-static int
-pattern_char_isalpha(char c, bool is_multibyte,
- pg_locale_t locale)
-{
- if (locale->ctype_is_c)
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
- else
- return char_is_cased(c, locale);
-}
-
/*
* For bytea, the increment function need only increment the current byte
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index b14c7837938..9319fb633b6 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1261,6 +1261,17 @@ size_t
pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
+ if (locale->ctype == NULL)
+ {
+ int i;
+
+ srclen = (srclen >= 0) ? srclen : strlen(src);
+ for (i = 0; i < srclen && i < dstsize; i++)
+ dst[i] = pg_ascii_tolower(src[i]);
+ if (i < dstsize)
+ dst[i] = '\0';
+ return srclen;
+ }
return locale->ctype->strlower(dst, dstsize, src, srclen, locale);
}
@@ -1268,6 +1279,29 @@ size_t
pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
+ if (locale->ctype == NULL)
+ {
+ bool wasalnum = false;
+ int i;
+
+ srclen = (srclen >= 0) ? srclen : strlen(src);
+ for (i = 0; i < Min(srclen, dstsize); i++)
+ {
+ char c = src[i];
+
+ if (wasalnum)
+ dst[i] = pg_ascii_tolower(c);
+ else
+ dst[i] = pg_ascii_toupper(c);
+
+ wasalnum = ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z'));
+ }
+ if (i < dstsize)
+ dst[i] = '\0';
+ return srclen;
+ }
return locale->ctype->strtitle(dst, dstsize, src, srclen, locale);
}
@@ -1275,6 +1309,17 @@ size_t
pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
+ if (locale->ctype == NULL)
+ {
+ int i;
+
+ srclen = (srclen >= 0) ? srclen : strlen(src);
+ for (i = 0; i < srclen && i < dstsize; i++)
+ dst[i] = pg_ascii_toupper(src[i]);
+ if (i < dstsize)
+ dst[i] = '\0';
+ return srclen;
+ }
return locale->ctype->strupper(dst, dstsize, src, srclen, locale);
}
@@ -1282,10 +1327,18 @@ size_t
pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- if (locale->ctype->strfold)
- return locale->ctype->strfold(dst, dstsize, src, srclen, locale);
- else
- return locale->ctype->strlower(dst, dstsize, src, srclen, locale);
+ if (locale->ctype == NULL)
+ {
+ int i;
+
+ srclen = (srclen >= 0) ? srclen : strlen(src);
+ for (i = 0; i < srclen && i < dstsize; i++)
+ dst[i] = pg_ascii_tolower(src[i]);
+ if (i < dstsize)
+ dst[i] = '\0';
+ return srclen;
+ }
+ return locale->ctype->strfold(dst, dstsize, src, srclen, locale);
}
/*
@@ -1560,6 +1613,8 @@ pg_towlower(pg_wchar wc, pg_locale_t locale)
bool
char_is_cased(char ch, pg_locale_t locale)
{
+ if (locale->ctype == NULL)
+ return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
return locale->ctype->char_is_cased(ch, locale);
}
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 716f005066a..942454de4ed 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -326,6 +326,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
.strlower = strlower_libc_sb,
.strtitle = strtitle_libc_sb,
.strupper = strupper_libc_sb,
+ .strfold = strlower_libc_sb,
.wc_isdigit = wc_isdigit_libc_sb,
.wc_isalpha = wc_isalpha_libc_sb,
.wc_isalnum = wc_isalnum_libc_sb,
@@ -351,6 +352,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
+ .strfold = strlower_libc_mb,
.wc_isdigit = wc_isdigit_libc_sb,
.wc_isalpha = wc_isalpha_libc_sb,
.wc_isalnum = wc_isalnum_libc_sb,
@@ -372,6 +374,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
+ .strfold = strlower_libc_mb,
.wc_isdigit = wc_isdigit_libc_mb,
.wc_isalpha = wc_isalpha_libc_mb,
.wc_isalnum = wc_isalnum_libc_mb,
--
2.43.0
From 9cc6025640a2fdb5bee4a84598a3fdb352d81954 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 10:20:36 -0800
Subject: [PATCH v8 5/7] Fix inconsistency between ltree_strncasecmp() and
ltree_crc32_sz().
Previously, ltree_strncasecmp() used lowercasing with the default
collation, while ltree_crc32_sz() used tolower() directly. These were
equivalent only if the default collation provider was libc and the
encoding was single-byte.
Change both to use casefolding with the default collation.
---
contrib/ltree/crc32.c | 46 ++++++++++++++++++++++++++++++++-------
contrib/ltree/lquery_op.c | 31 ++++++++++++++++++++++++--
2 files changed, 67 insertions(+), 10 deletions(-)
diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c
index 134f46a805e..3918d4a0ec2 100644
--- a/contrib/ltree/crc32.c
+++ b/contrib/ltree/crc32.c
@@ -10,31 +10,61 @@
#include "postgres.h"
#include "ltree.h"
+#include "crc32.h"
+#include "utils/pg_crc.h"
#ifdef LOWER_NODE
-#include <ctype.h>
-#define TOLOWER(x) tolower((unsigned char) (x))
-#else
-#define TOLOWER(x) (x)
+#include "utils/pg_locale.h"
#endif
-#include "crc32.h"
-#include "utils/pg_crc.h"
+#ifdef LOWER_NODE
unsigned int
ltree_crc32_sz(const char *buf, int size)
{
pg_crc32 crc;
const char *p = buf;
+ static pg_locale_t locale = NULL;
+
+ if (!locale)
+ locale = pg_database_locale();
INIT_TRADITIONAL_CRC32(crc);
while (size > 0)
{
- char c = (char) TOLOWER(*p);
+ char foldstr[UNICODE_CASEMAP_BUFSZ];
+ int srclen = pg_mblen(p);
+ size_t foldlen;
+
+ /* fold one codepoint at a time */
+ foldlen = pg_strfold(foldstr, UNICODE_CASEMAP_BUFSZ, p, srclen,
+ locale);
+
+ COMP_TRADITIONAL_CRC32(crc, foldstr, foldlen);
+
+ size -= srclen;
+ p += srclen;
+ }
+ FIN_TRADITIONAL_CRC32(crc);
+ return (unsigned int) crc;
+}
+
+#else
- COMP_TRADITIONAL_CRC32(crc, &c, 1);
+unsigned int
+ltree_crc32_sz(const char *buf, int size)
+{
+ pg_crc32 crc;
+ const char *p = buf;
+
+ INIT_TRADITIONAL_CRC32(crc);
+ while (size > 0)
+ {
+ COMP_TRADITIONAL_CRC32(crc, p, 1);
size--;
p++;
}
FIN_TRADITIONAL_CRC32(crc);
return (unsigned int) crc;
}
+
+#endif /* !LOWER_NODE */
diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c
index a6466f575fd..d6754eb613f 100644
--- a/contrib/ltree/lquery_op.c
+++ b/contrib/ltree/lquery_op.c
@@ -77,10 +77,37 @@ compare_subnode(ltree_level *t, char *qn, int len, int (*cmpptr) (const char *,
int
ltree_strncasecmp(const char *a, const char *b, size_t s)
{
- char *al = str_tolower(a, s, DEFAULT_COLLATION_OID);
- char *bl = str_tolower(b, s, DEFAULT_COLLATION_OID);
+ static pg_locale_t locale = NULL;
+ size_t al_sz = s + 1;
+ char *al = palloc(al_sz);
+ size_t bl_sz = s + 1;
+ char *bl = palloc(bl_sz);
+ size_t needed;
int res;
+ if (!locale)
+ locale = pg_database_locale();
+
+ needed = pg_strfold(al, al_sz, a, s, locale);
+ if (needed + 1 > al_sz)
+ {
+ /* grow buffer if needed and retry */
+ al_sz = needed + 1;
+ al = repalloc(al, al_sz);
+ needed = pg_strfold(al, al_sz, a, s, locale);
+ Assert(needed + 1 <= al_sz);
+ }
+
+ needed = pg_strfold(bl, bl_sz, b, s, locale);
+ if (needed + 1 > bl_sz)
+ {
+ /* grow buffer if needed and retry */
+ bl_sz = needed + 1;
+ bl = repalloc(bl, bl_sz);
+ needed = pg_strfold(bl, bl_sz, b, s, locale);
+ Assert(needed + 1 <= bl_sz);
+ }
+
res = strncmp(al, bl, s);
pfree(al);
--
2.43.0
From 709c38c8a3b992e5ddf2c6d93a838d7ef588c0f9 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 20 Nov 2025 13:09:26 -0800
Subject: [PATCH v8 6/7] Inline pg_ascii_tolower() and pg_ascii_toupper().
---
src/include/port.h | 25 +++++++++++++++++++++++--
src/port/pgstrcasecmp.c | 26 --------------------------
2 files changed, 23 insertions(+), 28 deletions(-)
diff --git a/src/include/port.h b/src/include/port.h
index 3964d3b1293..159c2bcd7e3 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -169,8 +169,29 @@ extern int pg_strcasecmp(const char *s1, const char *s2);
extern int pg_strncasecmp(const char *s1, const char *s2, size_t n);
extern unsigned char pg_toupper(unsigned char ch);
extern unsigned char pg_tolower(unsigned char ch);
-extern unsigned char pg_ascii_toupper(unsigned char ch);
-extern unsigned char pg_ascii_tolower(unsigned char ch);
+
+/*
+ * Fold a character to upper case, following C/POSIX locale rules.
+ */
+static inline unsigned char
+pg_ascii_toupper(unsigned char ch)
+{
+ if (ch >= 'a' && ch <= 'z')
+ ch += 'A' - 'a';
+ return ch;
+}
+
+/*
+ * Fold a character to lower case, following C/POSIX locale rules.
+ */
+static inline unsigned char
+pg_ascii_tolower(unsigned char ch)
+{
+ if (ch >= 'A' && ch <= 'Z')
+ ch += 'a' - 'A';
+ return ch;
+}
+
/*
* Beginning in v12, we always replace snprintf() and friends with our own
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
index ec2b3a75c3d..17e93180381 100644
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -13,10 +13,6 @@
*
* NB: this code should match downcase_truncate_identifier() in scansup.c.
*
- * We also provide strict ASCII-only case conversion functions, which can
- * be used to implement C/POSIX case folding semantics no matter what the
- * C library thinks the locale is.
- *
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
*
@@ -127,25 +123,3 @@ pg_tolower(unsigned char ch)
ch = tolower(ch);
return ch;
}
-
-/*
- * Fold a character to upper case, following C/POSIX locale rules.
- */
-unsigned char
-pg_ascii_toupper(unsigned char ch)
-{
- if (ch >= 'a' && ch <= 'z')
- ch += 'A' - 'a';
- return ch;
-}
-
-/*
- * Fold a character to lower case, following C/POSIX locale rules.
- */
-unsigned char
-pg_ascii_tolower(unsigned char ch)
-{
- if (ch >= 'A' && ch <= 'Z')
- ch += 'a' - 'A';
- return ch;
-}
--
2.43.0
From 81948ecc1e3c4f1b4bd79ecd96ac151e2332f3df Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 18:16:41 -0800
Subject: [PATCH v8 7/7] Remove char_tolower() API.
It's only useful for an ILIKE optimization for the libc provider using
a single-byte encoding and a non-C locale, but it creates significant
internal complexity.
---
src/backend/utils/adt/like.c | 42 +++++++++-----------------
src/backend/utils/adt/like_match.c | 18 ++++++-----
src/backend/utils/adt/pg_locale.c | 22 --------------
src/backend/utils/adt/pg_locale_libc.c | 10 ------
src/include/utils/pg_locale.h | 9 ------
5 files changed, 25 insertions(+), 76 deletions(-)
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 4216ac17f43..4a7fc583c71 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -43,8 +43,8 @@ static text *MB_do_like_escape(text *pat, text *esc);
static int UTF8_MatchText(const char *t, int tlen, const char *p, int plen,
pg_locale_t locale);
-static int SB_IMatchText(const char *t, int tlen, const char *p, int plen,
- pg_locale_t locale);
+static int C_IMatchText(const char *t, int tlen, const char *p, int plen,
+ pg_locale_t locale);
static int GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation);
static int Generic_Text_IC_like(text *str, text *pat, Oid collation);
@@ -84,22 +84,10 @@ wchareq(const char *p1, const char *p2)
* of getting a single character transformed to the system's wchar_t format.
* So now, we just downcase the strings using lower() and apply regular LIKE
* comparison. This should be revisited when we install better locale support.
- */
-
-/*
- * We do handle case-insensitive matching for single-byte encodings using
+ *
+ * We do handle case-insensitive matching for the C locale using
* fold-on-the-fly processing, however.
*/
-static char
-SB_lower_char(unsigned char c, pg_locale_t locale)
-{
- if (locale->ctype_is_c)
- return pg_ascii_tolower(c);
- else if (locale->is_default)
- return pg_tolower(c);
- else
- return char_tolower(c, locale);
-}
#define NextByte(p, plen) ((p)++, (plen)--)
@@ -131,9 +119,9 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
#include "like_match.c"
/* setup to compile like_match.c for single byte case insensitive matches */
-#define MATCH_LOWER(t, locale) SB_lower_char((unsigned char) (t), locale)
+#define MATCH_LOWER
#define NextChar(p, plen) NextByte((p), (plen))
-#define MatchText SB_IMatchText
+#define MatchText C_IMatchText
#include "like_match.c"
@@ -202,22 +190,17 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
errmsg("nondeterministic collations are not supported for ILIKE")));
/*
- * For efficiency reasons, in the single byte case we don't call lower()
- * on the pattern and text, but instead call SB_lower_char on each
- * character. In the multi-byte case we don't have much choice :-(. Also,
- * ICU does not support single-character case folding, so we go the long
- * way.
+ * For efficiency reasons, in the C locale we don't call lower() on the
+ * pattern and text, but instead call pg_ascii_tolower() on each character.
*/
- if (locale->ctype_is_c ||
- (char_tolower_enabled(locale) &&
- pg_database_encoding_max_length() == 1))
+ if (locale->ctype_is_c)
{
p = VARDATA_ANY(pat);
plen = VARSIZE_ANY_EXHDR(pat);
s = VARDATA_ANY(str);
slen = VARSIZE_ANY_EXHDR(str);
- return SB_IMatchText(s, slen, p, plen, locale);
+ return C_IMatchText(s, slen, p, plen, locale);
}
else
{
@@ -229,10 +212,13 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
PointerGetDatum(str)));
s = VARDATA_ANY(str);
slen = VARSIZE_ANY_EXHDR(str);
+
if (GetDatabaseEncoding() == PG_UTF8)
return UTF8_MatchText(s, slen, p, plen, 0);
- else
+ else if (pg_database_encoding_max_length() > 1)
return MB_MatchText(s, slen, p, plen, 0);
+ else
+ return SB_MatchText(s, slen, p, plen, 0);
}
}
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 892f8a745ea..54846c9541d 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -70,10 +70,14 @@
*--------------------
*/
+/*
+ * MATCH_LOWER is defined for ILIKE in the C locale as an optimization. Other
+ * locales must casefold the inputs before matching.
+ */
#ifdef MATCH_LOWER
-#define GETCHAR(t, locale) MATCH_LOWER(t, locale)
+#define GETCHAR(t) pg_ascii_tolower(t)
#else
-#define GETCHAR(t, locale) (t)
+#define GETCHAR(t) (t)
#endif
static int
@@ -105,7 +109,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("LIKE pattern must not end with escape character")));
- if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
+ if (GETCHAR(*p) != GETCHAR(*t))
return LIKE_FALSE;
}
else if (*p == '%')
@@ -167,14 +171,14 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("LIKE pattern must not end with escape character")));
- firstpat = GETCHAR(p[1], locale);
+ firstpat = GETCHAR(p[1]);
}
else
- firstpat = GETCHAR(*p, locale);
+ firstpat = GETCHAR(*p);
while (tlen > 0)
{
- if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
+ if (GETCHAR(*t) == firstpat || (locale && !locale->deterministic))
{
int matched = MatchText(t, tlen, p, plen, locale);
@@ -342,7 +346,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
NextChar(t1, t1len);
}
}
- else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
+ else if (GETCHAR(*p) != GETCHAR(*t))
{
/* non-wildcard pattern char fails to match text char */
return LIKE_FALSE;
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 9319fb633b6..b3afa6cad6c 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1618,28 +1618,6 @@ char_is_cased(char ch, pg_locale_t locale)
return locale->ctype->char_is_cased(ch, locale);
}
-/*
- * char_tolower_enabled()
- *
- * Does the provider support char_tolower()?
- */
-bool
-char_tolower_enabled(pg_locale_t locale)
-{
- return (locale->ctype->char_tolower != NULL);
-}
-
-/*
- * char_tolower()
- *
- * Convert char (single-byte encoding) to lowercase.
- */
-char
-char_tolower(unsigned char ch, pg_locale_t locale)
-{
- return locale->ctype->char_tolower(ch, locale);
-}
-
/*
* Return required encoding ID for the given locale, or -1 if any encoding is
* valid for the locale.
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 942454de4ed..3407e15712b 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -248,13 +248,6 @@ wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
#endif
}
-static char
-char_tolower_libc(unsigned char ch, pg_locale_t locale)
-{
- Assert(pg_database_encoding_max_length() == 1);
- return tolower_l(ch, locale->lt);
-}
-
static bool
char_is_cased_libc(char ch, pg_locale_t locale)
{
@@ -338,7 +331,6 @@ static const struct ctype_methods ctype_methods_libc_sb = {
.wc_isspace = wc_isspace_libc_sb,
.wc_isxdigit = wc_isxdigit_libc_sb,
.char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
.wc_tolower = tolower_libc_sb,
.max_chr = UCHAR_MAX,
@@ -364,7 +356,6 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
.wc_isspace = wc_isspace_libc_sb,
.wc_isxdigit = wc_isxdigit_libc_sb,
.char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
.wc_tolower = tolower_libc_sb,
.max_chr = UCHAR_MAX,
@@ -386,7 +377,6 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
.wc_isspace = wc_isspace_libc_mb,
.wc_isxdigit = wc_isxdigit_libc_mb,
.char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_mb,
.wc_tolower = tolower_libc_mb,
};
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 49fd22bf8eb..5e21b517e96 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -125,13 +125,6 @@ struct ctype_methods
/* required */
bool (*char_is_cased) (char ch, pg_locale_t locale);
- /*
- * Optional. If defined, will only be called for single-byte encodings. If
- * not defined, or if the encoding is multibyte, will fall back to
- * pg_strlower().
- */
- char (*char_tolower) (unsigned char ch, pg_locale_t locale);
-
/*
* For regex and pattern matching efficiency, the maximum char value
* supported by the above methods. If zero, limit is set by regex code.
@@ -188,8 +181,6 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
extern bool char_is_cased(char ch, pg_locale_t locale);
-extern bool char_tolower_enabled(pg_locale_t locale);
-extern char char_tolower(unsigned char ch, pg_locale_t locale);
extern size_t pg_strlower(char *dst, size_t dstsize,
const char *src, ssize_t srclen,
pg_locale_t locale);
--
2.43.0