On Wed, 2025-12-17 at 11:37 -0800, Jeff Davis wrote:
> On Tue, 2025-12-16 at 07:34 +0800, Chao Li wrote:
> > > <v2-0001-Make-utf8_to_unicode-safer.patch>
> >
> > V2 LGTM.
>
> On second thought, if we're going to change something here, we should
> probably have a more flexible API for both utf8_to_unicode() and
> unicode_to_utf8().
New series:
0001: validates UTF8 before calling into unicode_case.c. Extra defense,
and simple to backport, but regresses performance of the builtin
lower/title/upper/fold functions. It also risks raising new errors if
the input somehow contains invalid UTF8.
0002: refactors to create an error path from unicode_case.c into
pg_locale_builtin.c, where a proper error can be thrown. This wins back
the performance lost in the previous commit. This is perhaps
backportable, but technically it changes an exported function
signature, so carries some very low risk.
0003: Adds utf8encode() and utf8decode(), which are iteration-friendly
and inlinable, and fully validate UTF8 (e.g. they reject surrogate halves).
This is an enhancement so should not be backported.
0004: Makes use of the new utf8encode()/utf8decode() API in
unicode_case.c and pg_locale_builtin.c.
Regards,
Jeff Davis
From 4d59a316a147d68ff0113cec6b969037d2ee169e Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Fri, 19 Jun 2026 14:09:31 -0700
Subject: [PATCH v3 1/4] unicode_case.c: ensure valid UTF8.
The input should already be valid UTF8, but verify it before calling
unicode_strlower(), etc.
Discussion: https://postgr.es/m/[email protected]
Reviewed-by: Chao Li <[email protected]>
Backpatch-through: 17
---
src/backend/utils/adt/pg_locale_builtin.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 01d4f55b07e..7f167e751ea 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -86,6 +86,7 @@ static size_t
strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
+ pg_verifymbstr(src, srclen, false);
return unicode_strlower(dest, destsize, src, srclen,
locale->builtin.casemap_full);
}
@@ -103,6 +104,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
.prev_alnum = false,
};
+ pg_verifymbstr(src, srclen, false);
return unicode_strtitle(dest, destsize, src, srclen,
locale->builtin.casemap_full,
initcap_wbnext, &wbstate);
@@ -112,6 +114,7 @@ static size_t
strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
+ pg_verifymbstr(src, srclen, false);
return unicode_strupper(dest, destsize, src, srclen,
locale->builtin.casemap_full);
}
@@ -120,6 +123,7 @@ static size_t
strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
+ pg_verifymbstr(src, srclen, false);
return unicode_strfold(dest, destsize, src, srclen,
locale->builtin.casemap_full);
}
--
2.43.0
From fffa7153f563a19663a02e44196f377b83bf217f Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 16 Apr 2026 14:56:11 -0700
Subject: [PATCH v3 2/4] Move UTF8 checks into unicode_case.c.
Pre-checking UTF-8 is inefficient. Refactor the error paths so that
UTF-8 errors are caught while iterating and reported back to
pg_locale_builtin.c, where the error can be thrown.
Reviewed-by: Chao Li <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
src/backend/utils/adt/pg_locale_builtin.c | 85 +++++++++++++++------
src/common/unicode/case_test.c | 33 +++++---
src/common/unicode_case.c | 92 ++++++++++++++++-------
src/include/common/unicode_case.h | 8 +-
4 files changed, 156 insertions(+), 62 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 7f167e751ea..96da9c6fcf3 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,21 +62,32 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len)
{
- char32_t u = utf8_to_unicode((const unsigned char *) wbstate->str +
+ int ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
wbstate->offset);
- bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
+ char32_t u;
+ bool curr_alnum;
+
+ if (wbstate->offset + ulen > wbstate->len)
+ {
+ wbstate->offset = wbstate->len;
+ return wbstate->len;
+ }
+
+ u = utf8_to_unicode((const unsigned char *) wbstate->str +
+ wbstate->offset);
+ curr_alnum = pg_u_isalnum(u, wbstate->posix);
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
{
size_t prev_offset = wbstate->offset;
wbstate->init = true;
- wbstate->offset += unicode_utf8len(u);
+ wbstate->offset += ulen;
wbstate->prev_alnum = curr_alnum;
return prev_offset;
}
- wbstate->offset += unicode_utf8len(u);
+ wbstate->offset += ulen;
}
return wbstate->len;
@@ -86,9 +97,16 @@ static size_t
strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
- pg_verifymbstr(src, srclen, false);
- return unicode_strlower(dest, destsize, src, srclen,
- locale->builtin.casemap_full);
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strlower(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static size_t
@@ -96,36 +114,57 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
struct WordBoundaryState wbstate = {
- .str = src,
- .len = srclen,
- .offset = 0,
- .posix = !locale->builtin.casemap_full,
- .init = false,
- .prev_alnum = false,
+ .str = src,
+ .len = srclen,
+ .offset = 0,
+ .posix = !locale->builtin.casemap_full,
+ .init = false,
+ .prev_alnum = false,
};
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strtitle(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full,
+ initcap_wbnext, &wbstate);
- pg_verifymbstr(src, srclen, false);
- return unicode_strtitle(dest, destsize, src, srclen,
- locale->builtin.casemap_full,
- initcap_wbnext, &wbstate);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static size_t
strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
- pg_verifymbstr(src, srclen, false);
- return unicode_strupper(dest, destsize, src, srclen,
- locale->builtin.casemap_full);
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strupper(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static size_t
strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
- pg_verifymbstr(src, srclen, false);
- return unicode_strfold(dest, destsize, src, srclen,
- locale->builtin.casemap_full);
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strfold(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static bool
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index a0dbf00b671..ae0f86ffa0c 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -115,6 +115,7 @@ icu_test_full(char *str)
char icu_fold[BUFSZ];
UErrorCode status;
size_t len = strlen(str);
+ size_t consumed;
/* full case mapping doesn't use posix semantics */
struct WordBoundaryState wbstate = {
@@ -126,10 +127,10 @@ icu_test_full(char *str)
.prev_alnum = false,
};
- unicode_strlower(lower, BUFSZ, str, len, true);
- unicode_strtitle(title, BUFSZ, str, len, true, initcap_wbnext, &wbstate);
- unicode_strupper(upper, BUFSZ, str, len, true);
- unicode_strfold(fold, BUFSZ, str, len, true);
+ unicode_strlower(lower, BUFSZ, str, len, &consumed, true);
+ unicode_strtitle(title, BUFSZ, str, len, &consumed, true, initcap_wbnext, &wbstate);
+ unicode_strupper(upper, BUFSZ, str, len, &consumed, true);
+ unicode_strfold(fold, BUFSZ, str, len, &consumed, true);
status = U_ZERO_ERROR;
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, len, &status);
status = U_ZERO_ERROR;
@@ -260,13 +261,16 @@ static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
- return unicode_strlower(dst, dstsize, src, srclen, true);
+ size_t consumed;
+
+ return unicode_strlower(dst, dstsize, src, srclen, &consumed, true);
}
static size_t
tfunc_title(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
+ size_t consumed;
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
@@ -275,27 +279,34 @@ tfunc_title(char *dst, size_t dstsize, const char *src,
.prev_alnum = false,
};
- return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
- &wbstate);
+ return unicode_strtitle(dst, dstsize, src, srclen, &consumed, true,
+ initcap_wbnext, &wbstate);
}
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
- return unicode_strupper(dst, dstsize, src, srclen, true);
+ size_t consumed;
+
+ return unicode_strupper(dst, dstsize, src, srclen, &consumed, true);
}
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
- return unicode_strfold(dst, dstsize, src, srclen, true);
+ size_t consumed;
+
+ return unicode_strfold(dst, dstsize, src, srclen, &consumed, true);
}
static void
test_convert_case(void)
{
+ size_t needed;
+ size_t consumed;
+
/* test string with no case changes */
test_convert(tfunc_lower, "√∞", "√∞");
/* test adjust-to-cased behavior */
@@ -320,6 +331,10 @@ test_convert_case(void)
/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
test_convert(tfunc_title, "\uFF11a", "\uFF11a");
+ /* invalid UTF8 */
+ needed = unicode_strfold(NULL, 0, "abc\xCE", 4, &consumed, false);
+ Assert(consumed == 3);
+ Assert(needed == 3);
#ifdef USE_ICU
icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index d6ee00b7d9c..4a692cfa249 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -40,8 +40,8 @@ static const char32_t *const casekind_map[NCaseKind] =
static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
- CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
- void *wbstate);
+ size_t *pconsumed, CaseKind str_casekind, bool full,
+ WordBoundaryNext wbnext, void *wbstate);
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
char32_t *simple, const char32_t **special);
@@ -82,7 +82,8 @@ unicode_casefold_simple(char32_t code)
* unicode_strlower()
*
* Convert src to lowercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -98,17 +99,18 @@ unicode_casefold_simple(char32_t code)
*/
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full)
+ size_t *pconsumed, bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
- NULL);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseLower, full,
+ NULL, NULL);
}
/*
* unicode_strtitle()
*
* Convert src to titlecase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -134,17 +136,19 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full, WordBoundaryNext wbnext, void *wbstate)
+ size_t *pconsumed, bool full, WordBoundaryNext wbnext,
+ void *wbstate)
{
- return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
- wbstate);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseTitle, full,
+ wbnext, wbstate);
}
/*
* unicode_strupper()
*
* Convert src to uppercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -160,17 +164,18 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full)
+ size_t *pconsumed, bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
- NULL);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseUpper, full,
+ NULL, NULL);
}
/*
* unicode_strfold()
*
* Case fold src, and return the result length (not including terminating
- * NUL).
+ * NUL). Sets *pconsumed to the amount of src successfully consumed; if less
+ * than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -183,10 +188,26 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
size_t
unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full)
+ size_t *pconsumed, bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
- NULL);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseFold, full,
+ NULL, NULL);
+}
+
+/* local version of pg_utf_mblen() to be inlinable */
+static int
+utf8_mblen(const unsigned char *s)
+{
+ if ((*s & 0x80) == 0)
+ return 1;
+ else if ((*s & 0xe0) == 0xc0)
+ return 2;
+ else if ((*s & 0xf0) == 0xe0)
+ return 3;
+ else if ((*s & 0xf8) == 0xf0)
+ return 4;
+ else
+ return -1;
}
/*
@@ -207,8 +228,8 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
static size_t
convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
- CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
- void *wbstate)
+ size_t *pconsumed, CaseKind str_casekind, bool full,
+ WordBoundaryNext wbnext, void *wbstate)
{
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
@@ -227,12 +248,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
while (srcoff < srclen)
{
- char32_t u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
- int u1len = unicode_utf8len(u1);
+ int u1len = utf8_mblen((const unsigned char *) src + srcoff);
+ char32_t u1;
char32_t simple = 0;
const char32_t *special = NULL;
enum CaseMapResult casemap_result;
+ /* invalid UTF8 */
+ if (u1len < 0 || srcoff + u1len > srclen)
+ break;
+
+ u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
+
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
@@ -293,6 +320,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
if (result_len < dstsize)
dst[result_len] = '\0';
+ *pconsumed = srcoff;
return result_len;
}
@@ -316,7 +344,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- char32_t curr = utf8_to_unicode(str + i);
+ int u1len = utf8_mblen((const unsigned char *) str + i);
+ char32_t curr;
+
+ /* invalid UTF8 */
+ if (u1len < 0 || i + u1len > len)
+ return false;
+
+ curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -327,8 +362,6 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
}
else if ((str[i] & 0xC0) == 0x80)
continue;
-
- Assert(false); /* invalid UTF-8 */
}
/* end of string is not followed by a Cased character */
@@ -340,7 +373,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- char32_t curr = utf8_to_unicode(str + i);
+ int u1len = utf8_mblen((const unsigned char *) str + i);
+ char32_t curr;
+
+ /* invalid UTF8 */
+ if (u1len < 0 || i + u1len > len)
+ return false;
+
+ curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 03add78cabe..1cbc0c14bc2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -21,13 +21,13 @@ char32_t unicode_titlecase_simple(char32_t code);
char32_t unicode_uppercase_simple(char32_t code);
char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full);
+ size_t srclen, size_t *pconsumed, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full,
+ size_t srclen, size_t *pconsumed, bool full,
WordBoundaryNext wbnext, void *wbstate);
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full);
+ size_t srclen, size_t *pconsumed, bool full);
size_t unicode_strfold(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full);
+ size_t srclen, size_t *pconsumed, bool full);
#endif /* UNICODE_CASE_H */
--
2.43.0
From ba582fcb653d27110675831f29b3b088609c02ff Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 4 Jun 2026 12:08:51 -0700
Subject: [PATCH v3 3/4] Validating, iterator-friendly UTF8 encoder/decoder
API.
Reviewed-by: Chao Li <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
src/include/mb/pg_wchar.h | 160 +++++++++++++++++++++++++++++++++++++-
1 file changed, 158 insertions(+), 2 deletions(-)
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index deee2a832c3..d8ea77c3fe0 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -27,6 +27,11 @@
*/
typedef unsigned int pg_wchar;
+/*
+ * Returned for decoding failures in utf8decode() and utf8_to_unicode().
+ */
+#define PG_INVALID_CODEPOINT 0xFFFFFFFF
+
/*
* Maximum byte length of multibyte characters in any backend encoding
*/
@@ -392,11 +397,161 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
}
+/*
+ * Encode the codepoint as UTF8 and return the number of bytes required. If
+ * the number of bytes required exceeds dstsize, just return the number of
+ * bytes required without modifying dst. If dstsize is zero, dst may be
+ * NULL. If codepoint is not a valid Unicode scalar value, return -1.
+ */
+static inline int
+utf8encode(unsigned char *dst, size_t dstsize, char32_t codepoint)
+{
+ int nbytes;
+
+ if (codepoint <= 0x7F)
+ nbytes = 1;
+ else if (codepoint <= 0x7FF)
+ nbytes = 2;
+ else if (codepoint <= 0xFFFF)
+ {
+ /* surrogate halves not valid for UTF8 */
+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
+ return -1;
+ nbytes = 3;
+ }
+ else if (codepoint <= 0x10FFFF)
+ nbytes = 4;
+ else
+ return -1;
+
+ if (nbytes > dstsize)
+ return nbytes;
+
+ if (codepoint <= 0x7F)
+ {
+ dst[0] = codepoint;
+ }
+ else if (codepoint <= 0x7FF)
+ {
+ dst[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+ dst[1] = 0x80 | (codepoint & 0x3F);
+ }
+ else if (codepoint <= 0xFFFF)
+ {
+ dst[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+ dst[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+ dst[2] = 0x80 | (codepoint & 0x3F);
+ }
+ else if (codepoint <= 0x10FFFF)
+ {
+ dst[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+ dst[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+ dst[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+ dst[3] = 0x80 | (codepoint & 0x3F);
+ }
+
+ return nbytes;
+}
+
+/*
+ * Decode the next Unicode codepoint from UTF8 at src, reading no more than
+ * srclen bytes (which must be at least 1). On success, *pcodepoint will be a
+ * valid Unicode scalar value; otherwise it will be set to PG_INVALID_CODEPOINT.
+ *
+ * Returns the number of bytes consumed. If srclen is not large enough
+ * (i.e. src is truncated in the middle of a sequence), returns 0. If invalid,
+ * returns -1.
+ */
+static inline int
+utf8decode(char32_t *pcodepoint, const unsigned char *src, size_t srclen)
+{
+ int nbytes;
+ char32_t codepoint;
+
+ Assert(srclen >= 1);
+
+ if ((*src & 0x80) == 0)
+ {
+ *pcodepoint = (char32_t) src[0];
+ return 1;
+ }
+
+ if ((*src & 0xe0) == 0xc0)
+ nbytes = 2;
+ else if ((*src & 0xf0) == 0xe0)
+ nbytes = 3;
+ else if ((*src & 0xf8) == 0xf0)
+ nbytes = 4;
+ else
+ goto invalid;
+
+ /* truncated */
+ if (srclen < nbytes)
+ {
+ *pcodepoint = PG_INVALID_CODEPOINT;
+ return 0;
+ }
+
+ if (nbytes == 2)
+ {
+ /* check continuation byte */
+ if ((src[1] & 0xc0) != 0x80)
+ goto invalid;
+
+ codepoint = (char32_t) (((src[0] & 0x1f) << 6) |
+ (src[1] & 0x3f));
+
+ /* overlong */
+ if (codepoint < 0x0080)
+ goto invalid;
+ }
+ else if (nbytes == 3)
+ {
+ /* check continuation bytes */
+ if ((src[1] & 0xc0) != 0x80 || (src[2] & 0xc0) != 0x80)
+ goto invalid;
+
+ codepoint = (char32_t) (((src[0] & 0x0f) << 12) |
+ ((src[1] & 0x3f) << 6) |
+ (src[2] & 0x3f));
+
+ /* overlong or surrogate half */
+ if (codepoint < 0x0800 ||
+ (codepoint >= 0xD800 && codepoint <= 0xDFFF))
+ goto invalid;
+ }
+ else if (nbytes == 4)
+ {
+ /* check continuation bytes */
+ if ((src[1] & 0xc0) != 0x80 || (src[2] & 0xc0) != 0x80 ||
+ (src[3] & 0xc0) != 0x80)
+ goto invalid;
+
+ codepoint = (char32_t) (((src[0] & 0x07) << 18) |
+ ((src[1] & 0x3f) << 12) |
+ ((src[2] & 0x3f) << 6) |
+ (src[3] & 0x3f));
+
+ /* overlong or out-of-range */
+ if (codepoint < 0x10000 || codepoint > 0x10FFFF)
+ goto invalid;
+ }
+
+ *pcodepoint = codepoint;
+ return nbytes;
+
+invalid:
+ *pcodepoint = PG_INVALID_CODEPOINT;
+ return -1;
+}
+
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
+ *
+ * XXX: Callers should consider utf8decode() instead.
*/
static inline char32_t
utf8_to_unicode(const unsigned char *c)
@@ -416,13 +571,14 @@ utf8_to_unicode(const unsigned char *c)
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
- /* that is an invalid code on purpose */
- return 0xffffffff;
+ return PG_INVALID_CODEPOINT;
}
/*
* Map a Unicode code point to UTF-8. utf8string must have at least
* unicode_utf8len(c) bytes available.
+ *
+ * XXX: Callers should consider utf8encode() instead.
*/
static inline unsigned char *
unicode_to_utf8(char32_t c, unsigned char *utf8string)
--
2.43.0
From 942f374a4383502cbb66d0d83f0ba317962a9a24 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Fri, 19 Jun 2026 14:58:47 -0700
Subject: [PATCH v3 4/4] unicode_case.c: use new utf8encode/utf8decode APIs.
Reviewed-by: Chao Li <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
src/backend/utils/adt/pg_locale_builtin.c | 17 +++---
src/common/unicode/case_test.c | 2 +-
src/common/unicode_case.c | 70 +++++++++++------------
3 files changed, 44 insertions(+), 45 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 96da9c6fcf3..a826ec1cfa8 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,25 +62,26 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len)
{
- int ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
- wbstate->offset);
+ int ulen;
char32_t u;
bool curr_alnum;
+ size_t prev_offset = wbstate->offset;
- if (wbstate->offset + ulen > wbstate->len)
+ ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+ wbstate->len - wbstate->offset);
+
+ /* invalid UTF8 */
+ if (ulen <= 0)
{
+ wbstate->init = true;
wbstate->offset = wbstate->len;
- return wbstate->len;
+ return prev_offset;
}
- u = utf8_to_unicode((const unsigned char *) wbstate->str +
- wbstate->offset);
curr_alnum = pg_u_isalnum(u, wbstate->posix);
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
{
- size_t prev_offset = wbstate->offset;
-
wbstate->init = true;
wbstate->offset += ulen;
wbstate->prev_alnum = curr_alnum;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index ae0f86ffa0c..71a6f2bbe70 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -179,7 +179,7 @@ test_icu(void)
{
pg_unicode_category category = unicode_category(code);
- if (category != PG_U_UNASSIGNED)
+ if (category != PG_U_UNASSIGNED && category != PG_U_SURROGATE)
{
uint8_t icu_category = u_charType(code);
char code_str[5] = {0};
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 4a692cfa249..d89f5ca4740 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -194,22 +194,6 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
NULL, NULL);
}
-/* local version of pg_utf_mblen() to be inlinable */
-static int
-utf8_mblen(const unsigned char *s)
-{
- if ((*s & 0x80) == 0)
- return 1;
- else if ((*s & 0xe0) == 0xc0)
- return 2;
- else if ((*s & 0xf0) == 0xe0)
- return 3;
- else if ((*s & 0xf8) == 0xf0)
- return 4;
- else
- return -1;
-}
-
/*
* Implement Unicode Default Case Conversion algorithm.
*
@@ -248,18 +232,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
while (srcoff < srclen)
{
- int u1len = utf8_mblen((const unsigned char *) src + srcoff);
char32_t u1;
+ int u1len;
char32_t simple = 0;
const char32_t *special = NULL;
enum CaseMapResult casemap_result;
+ u1len = utf8decode(&u1, (const unsigned char *) src + srcoff,
+ srclen - srcoff);
+
/* invalid UTF8 */
- if (u1len < 0 || srcoff + u1len > srclen)
+ if (u1len <= 0)
break;
- u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
-
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
@@ -280,6 +265,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
/* no mapping; copy bytes from src */
Assert(simple == 0);
Assert(special == NULL);
+
if (result_len + u1len <= dstsize)
memcpy(dst + result_len, src + srcoff, u1len);
@@ -289,11 +275,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
{
/* replace with single character */
char32_t u2 = simple;
- char32_t u2len = unicode_utf8len(u2);
+ int u2len;
+ size_t remaining = 0;
+ unsigned char *p = NULL;
+
+ if (dstsize > result_len)
+ {
+ remaining = dstsize - result_len;
+ p = (unsigned char *) dst + result_len;
+ }
Assert(special == NULL);
- if (result_len + u2len <= dstsize)
- unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+ u2len = utf8encode(p, remaining, u2);
result_len += u2len;
}
@@ -304,10 +297,17 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
char32_t u2 = special[i];
- size_t u2len = unicode_utf8len(u2);
+ int u2len;
+ size_t remaining = 0;
+ unsigned char *p = NULL;
- if (result_len + u2len <= dstsize)
- unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+ if (dstsize > result_len)
+ {
+ remaining = dstsize - result_len;
+ p = (unsigned char *) dst + result_len;
+ }
+
+ u2len = utf8encode(p, remaining, u2);
result_len += u2len;
}
@@ -344,15 +344,15 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- int u1len = utf8_mblen((const unsigned char *) str + i);
+ int u1len;
char32_t curr;
+ u1len = utf8decode(&curr, (const unsigned char *) str + i, len - i);
+
/* invalid UTF8 */
- if (u1len < 0 || i + u1len > len)
+ if (u1len <= 0)
return false;
- curr = utf8_to_unicode(str + i);
-
if (pg_u_prop_case_ignorable(curr))
continue;
else if (pg_u_prop_cased(curr))
@@ -373,15 +373,15 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- int u1len = utf8_mblen((const unsigned char *) str + i);
+ int u1len;
char32_t curr;
+ u1len = utf8decode(&curr, (const unsigned char *) str + i, len - i);
+
/* invalid UTF8 */
- if (u1len < 0 || i + u1len > len)
+ if (u1len <= 0)
return false;
- curr = utf8_to_unicode(str + i);
-
if (pg_u_prop_case_ignorable(curr))
continue;
else if (pg_u_prop_cased(curr))
@@ -391,8 +391,6 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
}
else if ((str[i] & 0xC0) == 0x80)
continue;
-
- Assert(false); /* invalid UTF-8 */
}
return true;
--
2.43.0