tsvector and tsquery are not collatable types, but they do need locale
information to parse the original text. It would not do any good to
make them collatable types, because a COLLATE clause would typically be
applied after the parsing is done.
Previously, tsearch used the database CTYPE for parsing, but that's not
good because it creates an unnecessary dependency on libc even when the
user has requested another provider.
This patch series allows tsearch to use the database default locale for
parsing. If the database collation is libc, there's no change.
Motivation:
(a) it reduces the dependence on setlocale(), which is not
thread-safe;
(b) if a user is using the builtin or ICU providers, understanding
the effects of LC_CTYPE can be very confusing;
(c) it would allow us to test more of the tsearch parsing behavior.
Notes:
* Should have the exact same behavior as before if the database
locale provider is libc. If the database locale provider is builtin or
ICU, then there will be some differences in tsearch parsing behavior.
* Most of the patches are straightforward, but v1-0005 might need extra
attention. There are quite a few cases there with subtle distinctions,
and I might have missed something. For example, in the "C" locale,
tsearch treats non-ascii characters as alpha, even though the libc
functions do not do so (I preserved this behavior).
* This introduces redundancy between the character isxyz() functions in
regc_pg_locale.c and similar functions in pg_locale.c. It would be easy
enough to refactor to eliminate the redundancy, but that might have
performance implications, so I didn't do it yet.
Regards,
Jeff Davis
From 2abc0d7e29e5d6edd401ccfcab4cb281e9b420a1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 11:45:00 -0700
Subject: [PATCH v1 1/6] Rename static functions pg_wc_xyz() to regc_wc_xyz().
The former names are better suited to a set of exported functions,
which will be added in an upcoming commit.
---
src/backend/regex/regc_locale.c | 50 +++++++++++++++---------------
src/backend/regex/regc_pg_locale.c | 34 ++++++++++----------
src/backend/regex/regcomp.c | 24 +++++++-------
src/include/regex/regcustom.h | 8 ++---
4 files changed, 58 insertions(+), 58 deletions(-)
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 77d1ce28168..252418f4482 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -453,7 +453,7 @@ range(struct vars *v, /* context */
for (c = a; c <= b; c++)
{
- cc = pg_wc_tolower(c);
+ cc = regc_wc_tolower(c);
if (cc != c &&
(before(cc, a) || before(b, cc)))
{
@@ -464,7 +464,7 @@ range(struct vars *v, /* context */
}
addchr(cv, cc);
}
- cc = pg_wc_toupper(c);
+ cc = regc_wc_toupper(c);
if (cc != c &&
(before(cc, a) || before(b, cc)))
{
@@ -594,16 +594,16 @@ cclasscvec(struct vars *v, /* context */
switch (cclasscode)
{
case CC_PRINT:
- cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isprint, cclasscode);
break;
case CC_ALNUM:
- cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isalnum, cclasscode);
break;
case CC_ALPHA:
- cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isalpha, cclasscode);
break;
case CC_WORD:
- cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isword, cclasscode);
break;
case CC_ASCII:
/* hard-wired meaning */
@@ -624,10 +624,10 @@ cclasscvec(struct vars *v, /* context */
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
- cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isdigit, cclasscode);
break;
case CC_PUNCT:
- cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_ispunct, cclasscode);
break;
case CC_XDIGIT:
@@ -645,16 +645,16 @@ cclasscvec(struct vars *v, /* context */
}
break;
case CC_SPACE:
- cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isspace, cclasscode);
break;
case CC_LOWER:
- cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_islower, cclasscode);
break;
case CC_UPPER:
- cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isupper, cclasscode);
break;
case CC_GRAPH:
- cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
+ cv = pg_ctype_get_cache(regc_wc_isgraph, cclasscode);
break;
}
@@ -679,29 +679,29 @@ cclass_column_index(struct colormap *cm, chr c)
* Note: we should not see requests to consider cclasses that are not
* treated as locale-specific by cclasscvec(), above.
*/
- if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
+ if (cm->classbits[CC_PRINT] && regc_wc_isprint(c))
colnum |= cm->classbits[CC_PRINT];
- if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
+ if (cm->classbits[CC_ALNUM] && regc_wc_isalnum(c))
colnum |= cm->classbits[CC_ALNUM];
- if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
+ if (cm->classbits[CC_ALPHA] && regc_wc_isalpha(c))
colnum |= cm->classbits[CC_ALPHA];
- if (cm->classbits[CC_WORD] && pg_wc_isword(c))
+ if (cm->classbits[CC_WORD] && regc_wc_isword(c))
colnum |= cm->classbits[CC_WORD];
assert(cm->classbits[CC_ASCII] == 0);
assert(cm->classbits[CC_BLANK] == 0);
assert(cm->classbits[CC_CNTRL] == 0);
- if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
+ if (cm->classbits[CC_DIGIT] && regc_wc_isdigit(c))
colnum |= cm->classbits[CC_DIGIT];
- if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
+ if (cm->classbits[CC_PUNCT] && regc_wc_ispunct(c))
colnum |= cm->classbits[CC_PUNCT];
assert(cm->classbits[CC_XDIGIT] == 0);
- if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
+ if (cm->classbits[CC_SPACE] && regc_wc_isspace(c))
colnum |= cm->classbits[CC_SPACE];
- if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
+ if (cm->classbits[CC_LOWER] && regc_wc_islower(c))
colnum |= cm->classbits[CC_LOWER];
- if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
+ if (cm->classbits[CC_UPPER] && regc_wc_isupper(c))
colnum |= cm->classbits[CC_UPPER];
- if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
+ if (cm->classbits[CC_GRAPH] && regc_wc_isgraph(c))
colnum |= cm->classbits[CC_GRAPH];
return colnum;
@@ -721,8 +721,8 @@ allcases(struct vars *v, /* context */
chr lc,
uc;
- lc = pg_wc_tolower(c);
- uc = pg_wc_toupper(c);
+ lc = regc_wc_tolower(c);
+ uc = regc_wc_toupper(c);
cv = getcvec(v, 2, 0);
addchr(cv, lc);
@@ -760,7 +760,7 @@ casecmp(const chr *x, const chr *y, /* strings to compare */
{
for (; len > 0; len--, x++, y++)
{
- if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
+ if ((*x != *y) && (regc_wc_tolower(*x) != regc_wc_tolower(*y)))
return 1;
}
return 0;
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index d9eab5357bc..c236d44624f 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -228,7 +228,7 @@ pg_set_regex_collation(Oid collation)
}
static int
-pg_wc_isdigit(pg_wchar c)
+regc_wc_isdigit(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -238,7 +238,7 @@ pg_wc_isdigit(pg_wchar c)
}
static int
-pg_wc_isalpha(pg_wchar c)
+regc_wc_isalpha(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -248,7 +248,7 @@ pg_wc_isalpha(pg_wchar c)
}
static int
-pg_wc_isalnum(pg_wchar c)
+regc_wc_isalnum(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -258,16 +258,16 @@ pg_wc_isalnum(pg_wchar c)
}
static int
-pg_wc_isword(pg_wchar c)
+regc_wc_isword(pg_wchar c)
{
/* We define word characters as alnum class plus underscore */
if (c == CHR('_'))
return 1;
- return pg_wc_isalnum(c);
+ return regc_wc_isalnum(c);
}
static int
-pg_wc_isupper(pg_wchar c)
+regc_wc_isupper(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -277,7 +277,7 @@ pg_wc_isupper(pg_wchar c)
}
static int
-pg_wc_islower(pg_wchar c)
+regc_wc_islower(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -287,7 +287,7 @@ pg_wc_islower(pg_wchar c)
}
static int
-pg_wc_isgraph(pg_wchar c)
+regc_wc_isgraph(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -297,7 +297,7 @@ pg_wc_isgraph(pg_wchar c)
}
static int
-pg_wc_isprint(pg_wchar c)
+regc_wc_isprint(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -307,7 +307,7 @@ pg_wc_isprint(pg_wchar c)
}
static int
-pg_wc_ispunct(pg_wchar c)
+regc_wc_ispunct(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -317,7 +317,7 @@ pg_wc_ispunct(pg_wchar c)
}
static int
-pg_wc_isspace(pg_wchar c)
+regc_wc_isspace(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
@@ -327,7 +327,7 @@ pg_wc_isspace(pg_wchar c)
}
static pg_wchar
-pg_wc_toupper(pg_wchar c)
+regc_wc_toupper(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
{
@@ -340,7 +340,7 @@ pg_wc_toupper(pg_wchar c)
}
static pg_wchar
-pg_wc_tolower(pg_wchar c)
+regc_wc_tolower(pg_wchar c)
{
if (pg_regex_locale->ctype_is_c)
{
@@ -366,11 +366,11 @@ pg_wc_tolower(pg_wchar c)
* the main regex code expects us to return a failure indication instead.
*/
-typedef int (*pg_wc_probefunc) (pg_wchar c);
+typedef int (*regc_wc_probefunc) (pg_wchar c);
typedef struct pg_ctype_cache
{
- pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
+ regc_wc_probefunc probefunc; /* regc_wc_isalpha or a sibling */
pg_locale_t locale; /* locale this entry is for */
struct cvec cv; /* cache entry contents */
struct pg_ctype_cache *next; /* chain link */
@@ -419,14 +419,14 @@ store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
}
/*
- * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
+ * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all
* chrs satisfying the probe function. The active collation is the one
* previously set by pg_set_regex_collation. Return NULL if out of memory.
*
* Note that the result must not be freed or modified by caller.
*/
static struct cvec *
-pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
+pg_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
{
pg_ctype_cache *pcc;
pg_wchar max_chr;
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 15b264e50f1..3e18e4a78a2 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -249,18 +249,18 @@ static struct cvec *getcvec(struct vars *v, int nchrs, int nranges);
static void freecvec(struct cvec *cv);
/* === regc_pg_locale.c === */
-static int pg_wc_isdigit(pg_wchar c);
-static int pg_wc_isalpha(pg_wchar c);
-static int pg_wc_isalnum(pg_wchar c);
-static int pg_wc_isword(pg_wchar c);
-static int pg_wc_isupper(pg_wchar c);
-static int pg_wc_islower(pg_wchar c);
-static int pg_wc_isgraph(pg_wchar c);
-static int pg_wc_isprint(pg_wchar c);
-static int pg_wc_ispunct(pg_wchar c);
-static int pg_wc_isspace(pg_wchar c);
-static pg_wchar pg_wc_toupper(pg_wchar c);
-static pg_wchar pg_wc_tolower(pg_wchar c);
+static int regc_wc_isdigit(pg_wchar c);
+static int regc_wc_isalpha(pg_wchar c);
+static int regc_wc_isalnum(pg_wchar c);
+static int regc_wc_isword(pg_wchar c);
+static int regc_wc_isupper(pg_wchar c);
+static int regc_wc_islower(pg_wchar c);
+static int regc_wc_isgraph(pg_wchar c);
+static int regc_wc_isprint(pg_wchar c);
+static int regc_wc_ispunct(pg_wchar c);
+static int regc_wc_isspace(pg_wchar c);
+static pg_wchar regc_wc_toupper(pg_wchar c);
+static pg_wchar regc_wc_tolower(pg_wchar c);
/* === regc_locale.c === */
static chr element(struct vars *v, const chr *startp, const chr *endp);
diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h
index af0fe97c796..1c0e92f168f 100644
--- a/src/include/regex/regcustom.h
+++ b/src/include/regex/regcustom.h
@@ -88,10 +88,10 @@ typedef unsigned uchr; /* unsigned type that will hold a chr */
#define MAX_SIMPLE_CHR 0x7FF /* suitable value for Unicode */
/* functions operating on chr */
-#define iscalnum(x) pg_wc_isalnum(x)
-#define iscalpha(x) pg_wc_isalpha(x)
-#define iscdigit(x) pg_wc_isdigit(x)
-#define iscspace(x) pg_wc_isspace(x)
+#define iscalnum(x) regc_wc_isalnum(x)
+#define iscalpha(x) regc_wc_isalpha(x)
+#define iscdigit(x) regc_wc_isdigit(x)
+#define iscspace(x) regc_wc_isspace(x)
/* and pick up the standard header */
#include "regex.h"
--
2.43.0
From 4dd4eba11c8ffc0c2910a820cf3b591945e7dd42 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 12:29:33 -0700
Subject: [PATCH v1 2/6] Add pg_wc_xyz() exported functions.
Useful for tsearch and possibly other places.
Does not remove the regc_wc_xyz() versions, which may be done in a
subsequent commit but would require a bit more refactoring.
---
src/backend/regex/regc_pg_locale.c | 150 ++-------------------------
src/backend/utils/adt/pg_locale.c | 123 ++++++++++++++++++++++
src/include/utils/pg_locale.h | 12 +++
src/include/utils/pg_locale_c.h | 160 +++++++++++++++++++++++++++++
4 files changed, 301 insertions(+), 144 deletions(-)
create mode 100644 src/include/utils/pg_locale_c.h
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index c236d44624f..0647da9a848 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -19,6 +19,7 @@
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "utils/pg_locale.h"
+#include "utils/pg_locale_c.h"
static pg_locale_t pg_regex_locale;
@@ -27,150 +28,6 @@ static struct pg_locale_struct dummy_c_locale = {
.ctype_is_c = true,
};
-/*
- * Hard-wired character properties for C locale
- */
-#define PG_ISDIGIT 0x01
-#define PG_ISALPHA 0x02
-#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA)
-#define PG_ISUPPER 0x04
-#define PG_ISLOWER 0x08
-#define PG_ISGRAPH 0x10
-#define PG_ISPRINT 0x20
-#define PG_ISPUNCT 0x40
-#define PG_ISSPACE 0x80
-
-static const unsigned char pg_char_properties[128] = {
- /* NUL */ 0,
- /* ^A */ 0,
- /* ^B */ 0,
- /* ^C */ 0,
- /* ^D */ 0,
- /* ^E */ 0,
- /* ^F */ 0,
- /* ^G */ 0,
- /* ^H */ 0,
- /* ^I */ PG_ISSPACE,
- /* ^J */ PG_ISSPACE,
- /* ^K */ PG_ISSPACE,
- /* ^L */ PG_ISSPACE,
- /* ^M */ PG_ISSPACE,
- /* ^N */ 0,
- /* ^O */ 0,
- /* ^P */ 0,
- /* ^Q */ 0,
- /* ^R */ 0,
- /* ^S */ 0,
- /* ^T */ 0,
- /* ^U */ 0,
- /* ^V */ 0,
- /* ^W */ 0,
- /* ^X */ 0,
- /* ^Y */ 0,
- /* ^Z */ 0,
- /* ^[ */ 0,
- /* ^\ */ 0,
- /* ^] */ 0,
- /* ^^ */ 0,
- /* ^_ */ 0,
- /* */ PG_ISPRINT | PG_ISSPACE,
- /* ! */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* . */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
- /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ? */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
- /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* a */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* m */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
- /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
- /* DEL */ 0
-};
-
/*
* pg_set_regex_collation: set collation for these functions to obey
@@ -227,6 +84,11 @@ pg_set_regex_collation(Oid collation)
pg_regex_locale = locale;
}
+/*
+ * The following functions are redundant with those defined in
+ * pg_locale.c. XXX: refactor without adding overhead.
+ */
+
static int
regc_wc_isdigit(pg_wchar c)
{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 97c2ac1faf9..07d26bb5e02 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -45,6 +45,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
+#include "utils/pg_locale_c.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
@@ -1149,6 +1150,13 @@ init_database_collation(void)
PGLOCALE_SUPPORT_ERROR(dbform->datlocprovider);
result->is_default = true;
+
+ Assert((result->collate_is_c && result->collate == NULL) ||
+ (!result->collate_is_c && result->collate != NULL));
+
+ Assert((result->ctype_is_c && result->ctype == NULL) ||
+ (!result->ctype_is_c && result->ctype != NULL));
+
ReleaseSysCache(tup);
default_locale = result;
@@ -1395,6 +1403,121 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale);
}
+bool
+pg_wc_isdigit(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISDIGIT));
+ else
+ return locale->ctype->wc_isdigit(wc, locale);
+}
+
+bool
+pg_wc_isalpha(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISALPHA));
+ else
+ return locale->ctype->wc_isalpha(wc, locale);
+}
+
+bool
+pg_wc_isalnum(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISALNUM));
+ else
+ return locale->ctype->wc_isalnum(wc, locale);
+}
+
+bool
+pg_wc_isupper(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISUPPER));
+ else
+ return locale->ctype->wc_isupper(wc, locale);
+}
+
+bool
+pg_wc_islower(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISLOWER));
+ else
+ return locale->ctype->wc_islower(wc, locale);
+}
+
+bool
+pg_wc_isgraph(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISGRAPH));
+ else
+ return locale->ctype->wc_isgraph(wc, locale);
+}
+
+bool
+pg_wc_isprint(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISPRINT));
+ else
+ return locale->ctype->wc_isprint(wc, locale);
+}
+
+bool
+pg_wc_ispunct(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISPUNCT));
+ else
+ return locale->ctype->wc_ispunct(wc, locale);
+}
+
+bool
+pg_wc_isspace(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISSPACE));
+ else
+ return locale->ctype->wc_isspace(wc, locale);
+}
+
+pg_wchar
+pg_wc_toupper(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ {
+ if (wc <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) wc);
+ return wc;
+ }
+ return locale->ctype->wc_toupper(wc, locale);
+}
+
+pg_wchar
+pg_wc_tolower(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ {
+ if (wc <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) wc);
+ return wc;
+ }
+ else
+ return locale->ctype->wc_tolower(wc, locale);
+}
+
/*
* char_is_cased()
*
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 7e83594fbaf..93a23bfe24c 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -208,6 +208,18 @@ extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
+extern bool pg_wc_isdigit(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isalpha(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isalnum(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isupper(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_islower(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isgraph(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isprint(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_ispunct(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isspace(pg_wchar wc, pg_locale_t locale);
+extern pg_wchar pg_wc_toupper(pg_wchar wc, pg_locale_t locale);
+extern pg_wchar pg_wc_tolower(pg_wchar wc, pg_locale_t locale);
+
extern int builtin_locale_encoding(const char *locale);
extern const char *builtin_validate_locale(int encoding, const char *locale);
extern void icu_validate_locale(const char *loc_str);
diff --git a/src/include/utils/pg_locale_c.h b/src/include/utils/pg_locale_c.h
new file mode 100644
index 00000000000..11a0f996db2
--- /dev/null
+++ b/src/include/utils/pg_locale_c.h
@@ -0,0 +1,160 @@
+/*-----------------------------------------------------------------------
+ *
+ * PostgreSQL locale utilities
+ *
+ * src/include/utils/pg_locale_c.h
+ *
+ * Copyright (c) 2002-2025, PostgreSQL Global Development Group
+ *
+ *-----------------------------------------------------------------------
+ */
+
+#ifndef _PG_LOCALE_C_
+#define _PG_LOCALE_C_
+
+/*
+ * Hard-wired character properties for C locale
+ */
+
+#define PG_ISDIGIT 0x01
+#define PG_ISALPHA 0x02
+#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA)
+#define PG_ISUPPER 0x04
+#define PG_ISLOWER 0x08
+#define PG_ISGRAPH 0x10
+#define PG_ISPRINT 0x20
+#define PG_ISPUNCT 0x40
+#define PG_ISSPACE 0x80
+
+static const unsigned char pg_char_properties[128] = {
+ /* NUL */ 0,
+ /* ^A */ 0,
+ /* ^B */ 0,
+ /* ^C */ 0,
+ /* ^D */ 0,
+ /* ^E */ 0,
+ /* ^F */ 0,
+ /* ^G */ 0,
+ /* ^H */ 0,
+ /* ^I */ PG_ISSPACE,
+ /* ^J */ PG_ISSPACE,
+ /* ^K */ PG_ISSPACE,
+ /* ^L */ PG_ISSPACE,
+ /* ^M */ PG_ISSPACE,
+ /* ^N */ 0,
+ /* ^O */ 0,
+ /* ^P */ 0,
+ /* ^Q */ 0,
+ /* ^R */ 0,
+ /* ^S */ 0,
+ /* ^T */ 0,
+ /* ^U */ 0,
+ /* ^V */ 0,
+ /* ^W */ 0,
+ /* ^X */ 0,
+ /* ^Y */ 0,
+ /* ^Z */ 0,
+ /* ^[ */ 0,
+ /* ^\ */ 0,
+ /* ^] */ 0,
+ /* ^^ */ 0,
+ /* ^_ */ 0,
+ /* */ PG_ISPRINT | PG_ISSPACE,
+ /* ! */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* . */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ? */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* a */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* m */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* DEL */ 0
+};
+
+#endif /* _PG_LOCALE_C_ */
--
2.43.0
From e30ab34f6a9df6f1d0e64438b3632f3ed9ce3452 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 13:01:25 -0700
Subject: [PATCH v1 3/6] Add pg_wc_isxdigit(), useful for tsearch.
---
src/backend/utils/adt/pg_locale.c | 12 ++++++++++++
src/backend/utils/adt/pg_locale_builtin.c | 7 +++++++
src/backend/utils/adt/pg_locale_icu.c | 7 +++++++
src/backend/utils/adt/pg_locale_libc.c | 23 +++++++++++++++++++++++
src/include/utils/pg_locale.h | 2 ++
5 files changed, 51 insertions(+)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 07d26bb5e02..c06004400fc 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1493,6 +1493,18 @@ pg_wc_isspace(pg_wchar wc, pg_locale_t locale)
return locale->ctype->wc_isspace(wc, locale);
}
+/*
+ * pg_wc_isxdigit
+ *
+ * Report whether wc is a hexadecimal digit under the given locale.
+ *
+ * When locale->ctype_is_c is set, only code points up to 127 can qualify:
+ * those flagged PG_ISDIGIT in pg_char_properties[], plus 'A'-'F' and
+ * 'a'-'f'.  The range test comes first so that pg_char_properties[] is
+ * never indexed with a value above 127.  For any other locale the answer
+ * comes from the locale's wc_isxdigit ctype method.
+ */
+bool
+pg_wc_isxdigit(pg_wchar wc, pg_locale_t locale)
+{
+ if (locale->ctype_is_c)
+ return (wc <= (pg_wchar) 127 &&
+ ((pg_char_properties[wc] & PG_ISDIGIT) ||
+ ((wc >= 'A' && wc <= 'F') ||
+ (wc >= 'a' && wc <= 'f'))));
+ else
+ return locale->ctype->wc_isxdigit(wc, locale);
+}
+
pg_wchar
pg_wc_toupper(pg_wchar wc, pg_locale_t locale)
{
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 526ab3c6711..3dc611b50e1 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -163,6 +163,12 @@ wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
return pg_u_isspace(wc);
}
+/*
+ * Hex-digit test for the builtin provider: delegates to pg_u_isxdigit(),
+ * passing the negation of the locale's builtin.casemap_full flag as the
+ * second argument.
+ *
+ * NOTE(review): the second argument is presumably pg_u_isxdigit()'s
+ * POSIX-compatibility switch -- confirm against its declaration.
+ */
+static bool
+wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
+{
+ return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+}
+
static bool
char_is_cased_builtin(char ch, pg_locale_t locale)
{
@@ -196,6 +202,7 @@ static const struct ctype_methods ctype_methods_builtin = {
.wc_isprint = wc_isprint_builtin,
.wc_ispunct = wc_ispunct_builtin,
.wc_isspace = wc_isspace_builtin,
+ .wc_isxdigit = wc_isxdigit_builtin,
.char_is_cased = char_is_cased_builtin,
.wc_tolower = wc_tolower_builtin,
.wc_toupper = wc_toupper_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 9f0b4eead73..05bad202669 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -212,6 +212,12 @@ wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
return u_isspace(wc);
}
+/*
+ * Hex-digit test for the ICU provider: delegates to u_isxdigit().  The
+ * locale argument is not consulted.
+ *
+ * NOTE(review): per the ICU documentation, u_isxdigit() also returns true
+ * for the fullwidth forms of 0-9, A-F and a-f, so this can accept
+ * characters that the C-locale path in pg_wc_isxdigit() rejects --
+ * confirm that is acceptable for tsearch parsing.
+ */
+static bool
+wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
+{
+ return u_isxdigit(wc);
+}
+
static const struct ctype_methods ctype_methods_icu = {
.strlower = strlower_icu,
.strtitle = strtitle_icu,
@@ -226,6 +232,7 @@ static const struct ctype_methods ctype_methods_icu = {
.wc_isprint = wc_isprint_icu,
.wc_ispunct = wc_ispunct_icu,
.wc_isspace = wc_isspace_icu,
+ .wc_isxdigit = wc_isxdigit_icu,
.char_is_cased = char_is_cased_icu,
.wc_toupper = toupper_icu,
.wc_tolower = tolower_icu,
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index f56b5dbdd37..34865ccf00e 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -172,6 +172,16 @@ wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
return isspace_l((unsigned char) wc, locale->lt);
}
+/*
+ * Hex-digit test for libc using the single-byte classification function.
+ * wc is narrowed to unsigned char and tested against locale->lt with
+ * isxdigit_l(), or with _isxdigit_l() when WIN32 is defined.
+ */
+static bool
+wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+#ifndef WIN32
+ return isxdigit_l((unsigned char) wc, locale->lt);
+#else
+ return _isxdigit_l((unsigned char) wc, locale->lt);
+#endif
+}
+
static bool
wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
@@ -226,6 +236,16 @@ wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
return iswspace_l((wint_t) wc, locale->lt);
}
+/*
+ * Hex-digit test for libc using the wide-character classification
+ * function.  wc is cast to wint_t and tested against locale->lt with
+ * iswxdigit_l(), or with _iswxdigit_l() when WIN32 is defined.
+ */
+static bool
+wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+#ifndef WIN32
+ return iswxdigit_l((wint_t) wc, locale->lt);
+#else
+ return _iswxdigit_l((wint_t) wc, locale->lt);
+#endif
+}
+
static char
char_tolower_libc(unsigned char ch, pg_locale_t locale)
{
@@ -313,6 +333,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
.wc_isprint = wc_isprint_libc_sb,
.wc_ispunct = wc_ispunct_libc_sb,
.wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
@@ -337,6 +358,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
.wc_isprint = wc_isprint_libc_sb,
.wc_ispunct = wc_ispunct_libc_sb,
.wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
@@ -357,6 +379,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
.wc_isprint = wc_isprint_libc_mb,
.wc_ispunct = wc_ispunct_libc_mb,
.wc_isspace = wc_isspace_libc_mb,
+ .wc_isxdigit = wc_isxdigit_libc_mb,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_mb,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 93a23bfe24c..2f6b04062f2 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -110,6 +110,7 @@ struct ctype_methods
bool (*wc_isprint) (pg_wchar wc, pg_locale_t locale);
bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale);
bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale);
+ bool (*wc_isxdigit) (pg_wchar wc, pg_locale_t locale);
pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale);
pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale);
@@ -217,6 +218,7 @@ extern bool pg_wc_isgraph(pg_wchar wc, pg_locale_t locale);
extern bool pg_wc_isprint(pg_wchar wc, pg_locale_t locale);
extern bool pg_wc_ispunct(pg_wchar wc, pg_locale_t locale);
extern bool pg_wc_isspace(pg_wchar wc, pg_locale_t locale);
+extern bool pg_wc_isxdigit(pg_wchar wc, pg_locale_t locale);
extern pg_wchar pg_wc_toupper(pg_wchar wc, pg_locale_t locale);
extern pg_wchar pg_wc_tolower(pg_wchar wc, pg_locale_t locale);
--
2.43.0
From 12e67f9e85154e471f5906e73f314c49d20af111 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 14:24:59 -0700
Subject: [PATCH v1 4/6] Add pg_database_locale() to retrieve database default
locale.
---
src/backend/utils/adt/pg_locale.c | 9 +++++++++
src/include/utils/pg_locale.h | 1 +
2 files changed, 10 insertions(+)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index c06004400fc..5570402b35e 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1162,6 +1162,15 @@ init_database_collation(void)
default_locale = result;
}
+/*
+ * Get database default locale.
+ *
+ * Convenience wrapper that looks up DEFAULT_COLLATION_OID through
+ * pg_newlocale_from_collation() and returns whatever that yields.
+ */
+pg_locale_t
+pg_database_locale(void)
+{
+ return pg_newlocale_from_collation(DEFAULT_COLLATION_OID);
+}
+
/*
* Create a pg_locale_t from a collation OID. Results are cached for the
* lifetime of the backend. Thus, do not free the result with freelocale().
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 2f6b04062f2..259fc70bb78 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -176,6 +176,7 @@ struct pg_locale_struct
};
extern void init_database_collation(void);
+extern pg_locale_t pg_database_locale(void);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
--
2.43.0
From 4f8fa0fcc3efe4297aca58ee28f047c47a576d84 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 6 Oct 2025 13:05:17 -0700
Subject: [PATCH v1 5/6] tsearch: use database default collation for parsing.
Previously, tsearch used the database's CTYPE setting, which only
matches the database default collation if the locale provider is libc.
Note that tsearch types (tsvector and tsquery) are not collatable
types. The locale affects parsing the original text, which is a lossy
process, so a COLLATE clause on the already-parsed value would not
make sense.
---
src/backend/tsearch/ts_locale.c | 40 ++++++-----------
src/backend/tsearch/wparser_def.c | 71 ++++++-------------------------
2 files changed, 27 insertions(+), 84 deletions(-)
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 4801fe90089..9db13b72f99 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -20,45 +20,33 @@
static void tsearch_readline_callback(void *arg);
-/*
- * The reason these functions use a 3-wchar_t output buffer, not 2 as you
- * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
- * getting from char2wchar() is UTF16 not UTF32. A single input character
- * may therefore produce a surrogate pair rather than just one wchar_t;
- * we also need room for a trailing null. When we do get a surrogate pair,
- * we pass just the first code to iswdigit() etc, so that these functions will
- * always return false for characters outside the Basic Multilingual Plane.
- */
-#define WC_BUF_LEN 3
+/* space for a single character plus a trailing NUL */
+#define WC_BUF_LEN 2
+/*
+ * t_isalpha
+ *
+ * Convert the multibyte character at ptr (pg_mblen(ptr) bytes) to pg_wchar
+ * and classify it with pg_wc_isalpha() under the database default locale.
+ */
int
t_isalpha(const char *ptr)
{
- int clen = pg_mblen(ptr);
- wchar_t character[WC_BUF_LEN];
- locale_t mylocale = 0; /* TODO */
+ pg_wchar wstr[WC_BUF_LEN];
+ int wlen pg_attribute_unused();
- if (clen == 1 || database_ctype_is_c)
- return isalpha(TOUCHAR(ptr));
+ wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+ Assert(wlen <= 1);
- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
- return iswalpha((wint_t) character[0]);
+ /* pass single character, or NUL if empty */
+ return pg_wc_isalpha(wstr[0], pg_database_locale());
}
+/*
+ * t_isalnum
+ *
+ * Convert the multibyte character at ptr (pg_mblen(ptr) bytes) to pg_wchar
+ * and classify it with pg_wc_isalnum() under the database default locale.
+ */
int
t_isalnum(const char *ptr)
{
- int clen = pg_mblen(ptr);
- wchar_t character[WC_BUF_LEN];
- locale_t mylocale = 0; /* TODO */
-
- if (clen == 1 || database_ctype_is_c)
- return isalnum(TOUCHAR(ptr));
+ pg_wchar wstr[WC_BUF_LEN];
+ int wlen pg_attribute_unused();
- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+ wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+ Assert(wlen <= 1);
- return iswalnum((wint_t) character[0]);
+ /* pass single character, or NUL if empty */
+ return pg_wc_isalnum(wstr[0], pg_database_locale());
}
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index e2dd3da3aa3..e9129040422 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -243,9 +243,7 @@ typedef struct TParser
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
- wchar_t *wstr; /* wide character string */
pg_wchar *pgwstr; /* wide character string for C-locale */
- bool usewide;
/* State of parse */
int charmaxlen;
@@ -293,33 +291,8 @@ TParserInit(char *str, int len)
prs->charmaxlen = pg_database_encoding_max_length();
prs->str = str;
prs->lenstr = len;
-
- /*
- * Use wide char code only when max encoding length > 1.
- */
- if (prs->charmaxlen > 1)
- {
- locale_t mylocale = 0; /* TODO */
-
- prs->usewide = true;
- if (database_ctype_is_c)
- {
- /*
- * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
- * be different from sizeof(wchar_t)
- */
- prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
- pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
- }
- else
- {
- prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
- char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
- mylocale);
- }
- }
- else
- prs->usewide = false;
+ prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+ pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
prs->state = newTParserPosition(NULL);
prs->state->state = TPS_Base;
@@ -350,12 +323,9 @@ TParserCopyInit(const TParser *orig)
prs->charmaxlen = orig->charmaxlen;
prs->str = orig->str + orig->state->posbyte;
prs->lenstr = orig->lenstr - orig->state->posbyte;
- prs->usewide = orig->usewide;
if (orig->pgwstr)
prs->pgwstr = orig->pgwstr + orig->state->poschar;
- if (orig->wstr)
- prs->wstr = orig->wstr + orig->state->poschar;
prs->state = newTParserPosition(NULL);
prs->state->state = TPS_Base;
@@ -379,8 +349,6 @@ TParserClose(TParser *prs)
prs->state = ptr;
}
- if (prs->wstr)
- pfree(prs->wstr);
if (prs->pgwstr)
pfree(prs->pgwstr);
@@ -412,13 +380,9 @@ TParserCopyClose(TParser *prs)
/*
- * Character-type support functions, equivalent to is* macros, but
- * working with any possible encodings and locales. Notes:
- * - with multibyte encoding and C-locale isw* function may fail
- * or give wrong result.
- * - multibyte encoding and C-locale often are used for
- * Asian languages.
- * - if locale is C then we use pgwstr instead of wstr.
+ * Character-type support functions using the database default locale. If the
+ * locale is C, and the input character is non-ascii, the value to be returned
+ * is determined by the 'nonascii' macro argument.
*/
#define p_iswhat(type, nonascii) \
@@ -426,19 +390,13 @@ TParserCopyClose(TParser *prs)
static int \
p_is##type(TParser *prs) \
{ \
+ /* fetch the database default locale once; reused for both checks below */ \
+ pg_locale_t locale = pg_database_locale(); \
+ pg_wchar wc; \
Assert(prs->state); \
- if (prs->usewide) \
- { \
- if (prs->pgwstr) \
- { \
- unsigned int c = *(prs->pgwstr + prs->state->poschar); \
- if (c > 0x7f) \
- return nonascii; \
- return is##type(c); \
- } \
- return isw##type(*(prs->wstr + prs->state->poschar)); \
- } \
- return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
+ wc = prs->pgwstr[prs->state->poschar]; \
+ /* multibyte encoding + C locale: non-ASCII result is fixed by 'nonascii' */ \
+ if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f) \
+ return nonascii; \
+ return pg_wc_is##type(wc, locale); \
} \
\
static int \
@@ -703,7 +661,7 @@ p_isspecial(TParser *prs)
* Check that only in utf encoding, because other encodings aren't
* supported by postgres or even exists.
*/
- if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
+ if (GetDatabaseEncoding() == PG_UTF8)
{
static const pg_wchar strange_letter[] = {
/*
@@ -944,10 +902,7 @@ p_isspecial(TParser *prs)
*StopMiddle;
pg_wchar c;
- if (prs->pgwstr)
- c = *(prs->pgwstr + prs->state->poschar);
- else
- c = (pg_wchar) *(prs->wstr + prs->state->poschar);
+ c = *(prs->pgwstr + prs->state->poschar);
while (StopLow < StopHigh)
{
--
2.43.0
From 66d41b9bd5fd6f84667278809707308d3be3a282 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Tue, 7 Oct 2025 14:20:48 -0700
Subject: [PATCH v1 6/6] Remove obsolete global database_ctype_is_c.
Now that tsearch uses the database default locale, there's no need to
track the database CTYPE separately.
---
src/backend/utils/adt/pg_locale.c | 3 ---
src/backend/utils/init/postinit.c | 4 ----
src/include/utils/pg_locale.h | 3 ---
3 files changed, 10 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 5570402b35e..81a80bf075e 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -100,9 +100,6 @@ char *localized_full_days[7 + 1];
char *localized_abbrev_months[12 + 1];
char *localized_full_months[12 + 1];
-/* is the databases's LC_CTYPE the C locale? */
-bool database_ctype_is_c = false;
-
static pg_locale_t default_locale = NULL;
/* indicates whether locale information cache is valid */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 641e535a73c..98f9598cd78 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -430,10 +430,6 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect
" which is not recognized by setlocale().", ctype),
errhint("Recreate the database with another locale or install the missing locale.")));
- if (strcmp(ctype, "C") == 0 ||
- strcmp(ctype, "POSIX") == 0)
- database_ctype_is_c = true;
-
init_database_collation();
/*
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 259fc70bb78..af6a6d8bd63 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -39,9 +39,6 @@ extern PGDLLIMPORT char *localized_full_days[];
extern PGDLLIMPORT char *localized_abbrev_months[];
extern PGDLLIMPORT char *localized_full_months[];
-/* is the databases's LC_CTYPE the C locale? */
-extern PGDLLIMPORT bool database_ctype_is_c;
-
extern bool check_locale(int category, const char *locale, char **canonname);
extern char *pg_perm_setlocale(int category, const char *locale);
--
2.43.0