On Wed, 2026-06-24 at 16:44 +0800, Chao Li wrote:
> There is a compile warning against pg_wchar.h in 0004:
Fixed. I also rewrote utf8decode() to use a loop, which makes it slightly
smaller; that is good if we intend it to be inlined into a lot of callers.
Regards,
Jeff Davis
From 9a9bfbc4f3866d77e48933df052086a2558e7263 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 22 Jun 2026 16:30:01 -0700
Subject: [PATCH v5 1/5] unicode_case.c: defend against invalid UTF8.
Reviewed-by: Chao Li <[email protected]>
Discussion: https://postgr.es/m/[email protected]
Backpatch-through: 17
---
src/backend/utils/adt/pg_locale_builtin.c | 24 ++++++++---
src/common/unicode/case_test.c | 8 ++++
src/common/unicode_case.c | 52 +++++++++++++++++++----
3 files changed, 70 insertions(+), 14 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 01d4f55b07e..7c36fd5091b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,21 +62,33 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len)
{
- char32_t u = utf8_to_unicode((const unsigned char *) wbstate->str +
+ int ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
wbstate->offset);
- bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
+ char32_t u;
+ bool curr_alnum;
+ size_t prev_offset = wbstate->offset;
- if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+ /* invalid UTF8 */
+ if (wbstate->offset + ulen > wbstate->len)
{
- size_t prev_offset = wbstate->offset;
+ wbstate->init = true;
+ wbstate->offset = wbstate->len;
+ return prev_offset;
+ }
+ u = utf8_to_unicode((const unsigned char *) wbstate->str +
+ wbstate->offset);
+ curr_alnum = pg_u_isalnum(u, wbstate->posix);
+
+ if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+ {
wbstate->init = true;
- wbstate->offset += unicode_utf8len(u);
+ wbstate->offset += ulen;
wbstate->prev_alnum = curr_alnum;
return prev_offset;
}
- wbstate->offset += unicode_utf8len(u);
+ wbstate->offset += ulen;
}
return wbstate->len;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index a0dbf00b671..31ea94513bf 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -296,6 +296,8 @@ tfunc_fold(char *dst, size_t dstsize, const char *src,
static void
test_convert_case(void)
{
+ size_t needed;
+
/* test string with no case changes */
test_convert(tfunc_lower, "√∞", "√∞");
/* test adjust-to-cased behavior */
@@ -320,6 +322,12 @@ test_convert_case(void)
/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
test_convert(tfunc_title, "\uFF11a", "\uFF11a");
+ /* invalid UTF8: truncated multibyte sequence */
+ needed = unicode_strfold(NULL, 0, "abc\xCE", 4, false);
+ Assert(needed == 3);
+ /* invalid UTF8: invalid byte */
+ needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, false);
+ Assert(needed == 3);
#ifdef USE_ICU
icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index d6ee00b7d9c..42eb7d22211 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -189,6 +189,22 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
NULL);
}
+/* local version of pg_utf_mblen() to be inlinable */
+static int
+utf8_mblen(const unsigned char *s)
+{
+ if ((*s & 0x80) == 0)
+ return 1;
+ else if ((*s & 0xe0) == 0xc0)
+ return 2;
+ else if ((*s & 0xf0) == 0xe0)
+ return 3;
+ else if ((*s & 0xf8) == 0xf0)
+ return 4;
+ else
+ return -1;
+}
+
/*
* Implement Unicode Default Case Conversion algorithm.
*
@@ -227,12 +243,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
while (srcoff < srclen)
{
- char32_t u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
- int u1len = unicode_utf8len(u1);
+ int u1len = utf8_mblen((const unsigned char *) src + srcoff);
+ char32_t u1;
char32_t simple = 0;
const char32_t *special = NULL;
enum CaseMapResult casemap_result;
+ /* invalid UTF8 */
+ if (u1len < 0 || srcoff + u1len > srclen)
+ break;
+
+ u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
+
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
@@ -316,7 +338,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- char32_t curr = utf8_to_unicode(str + i);
+ int u1len = utf8_mblen((const unsigned char *) str + i);
+ char32_t curr;
+
+ /* invalid UTF8 */
+ if (u1len < 0 || i + u1len > len)
+ return false;
+
+ curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -327,8 +356,8 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
}
else if ((str[i] & 0xC0) == 0x80)
continue;
-
- Assert(false); /* invalid UTF-8 */
+ else
+ return false; /* invalid UTF8 */
}
/* end of string is not followed by a Cased character */
@@ -340,7 +369,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- char32_t curr = utf8_to_unicode(str + i);
+ int u1len = utf8_mblen((const unsigned char *) str + i);
+ char32_t curr;
+
+ /* invalid UTF8 */
+ if (u1len < 0 || i + u1len > len)
+ return false;
+
+ curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -351,8 +387,8 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
}
else if ((str[i] & 0xC0) == 0x80)
continue;
-
- Assert(false); /* invalid UTF-8 */
+ else
+ return false; /* invalid UTF8 */
}
return true;
--
2.43.0
From c389e8f3d47c51183702f468846c0ad8c33beaae Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Tue, 23 Jun 2026 17:09:49 -0700
Subject: [PATCH v5 2/5] pg_unicode_fast: fix final sigma logic.
If the sigma is preceded only by Case_Ignorable characters (that is, no
Cased character precedes it), don't consider it to be a final sigma.
In the process, refactor so that the preceding and following
characters are found first, and then the rule is applied, to improve
clarity.
Discussion: https://postgr.es/m/[email protected]
Backpatch-through: 18
---
src/common/unicode_case.c | 88 ++++++++++------------
src/test/regress/expected/collate.utf8.out | 6 ++
src/test/regress/sql/collate.utf8.sql | 1 +
3 files changed, 47 insertions(+), 48 deletions(-)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 42eb7d22211..dd5b3ba86d0 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -323,75 +323,67 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
* 3-17. The character at the given offset must be directly preceded by a
* Cased character, and must not be directly followed by a Cased character.
*
- * Case_Ignorable characters are ignored. NB: some characters may be both
+ * Case_Ignorable characters are ignored. Neither the beginning nor the end of
+ * the string is considered a Cased character. NB: some characters may be both
* Cased and Case_Ignorable, in which case they are ignored.
*/
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
- /* the start of the string is not preceded by a Cased character */
- if (offset == 0)
- return false;
+ bool preceded_by_cased = false;
+ bool followed_by_cased = false;
+ char32_t curr;
+ int ulen;
- /* iterate backwards, looking for Cased character */
- for (int i = offset - 1; i >= 0; i--)
+ /* iterate backwards looking for preceding character */
+ for (int i = offset; i > 0;)
{
- if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
- {
- int u1len = utf8_mblen((const unsigned char *) str + i);
- char32_t curr;
+ /* skip backwards through continuation bytes */
+ i--;
+ if ((str[i] & 0xC0) == 0x80)
+ continue;
- /* invalid UTF8 */
- if (u1len < 0 || i + u1len > len)
- return false;
+ /* now at leading byte of previous sequence */
+ Assert((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0);
- curr = utf8_to_unicode(str + i);
+ ulen = utf8_mblen((const unsigned char *) str + i);
- if (pg_u_prop_case_ignorable(curr))
- continue;
- else if (pg_u_prop_cased(curr))
- break;
- else
- return false;
+ /* invalid UTF8 */
+ if (ulen < 0 || i + ulen > len)
+ return false;
+
+ curr = utf8_to_unicode((const unsigned char *) str + i);
+
+ if (!pg_u_prop_case_ignorable(curr))
+ {
+ preceded_by_cased = pg_u_prop_cased(curr);
+ break;
}
- else if ((str[i] & 0xC0) == 0x80)
- continue;
- else
- return false; /* invalid UTF8 */
}
- /* end of string is not followed by a Cased character */
- if (offset == len)
- return true;
+ ulen = utf8_mblen((const unsigned char *) str + offset);
- /* iterate forwards, looking for Cased character */
- for (int i = offset + 1; i < len && str[i] != '\0'; i++)
+ /* iterate forward looking for following character */
+ for (int i = offset + ulen; i < len;)
{
- if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
- {
- int u1len = utf8_mblen((const unsigned char *) str + i);
- char32_t curr;
+ ulen = utf8_mblen((const unsigned char *) str + i);
- /* invalid UTF8 */
- if (u1len < 0 || i + u1len > len)
- return false;
+ /* invalid UTF8 */
+ if (ulen < 0 || i + ulen > len)
+ return false;
- curr = utf8_to_unicode(str + i);
+ curr = utf8_to_unicode((const unsigned char *) str + i);
- if (pg_u_prop_case_ignorable(curr))
- continue;
- else if (pg_u_prop_cased(curr))
- return false;
- else
- break;
+ if (!pg_u_prop_case_ignorable(curr))
+ {
+ followed_by_cased = pg_u_prop_cased(curr);
+ break;
}
- else if ((str[i] & 0xC0) == 0x80)
- continue;
- else
- return false; /* invalid UTF8 */
+
+ i += ulen;
}
- return true;
+ return (preceded_by_cased && !followed_by_cased);
}
/*
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..99fdc111fa4 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -263,6 +263,12 @@ SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
ᾳσͅα
(1 row)
+SELECT lower(U&'\0300\03A3' COLLATE PG_UNICODE_FAST);
+ lower
+-------
+ ̀σ
+(1 row)
+
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
?column?
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..22aecee3a60 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -128,6 +128,7 @@ SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+SELECT lower(U&'\0300\03A3' COLLATE PG_UNICODE_FAST);
-- properties
--
2.43.0
From 7f2494e90f041cb6e118c0a6a062b9e5364bc799 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 22 Jun 2026 16:31:38 -0700
Subject: [PATCH v5 3/5] unicode_case.c: change API to signal UTF8 decoding
error.
Errors at this point are not expected, but if encountered, signal to
the caller so it can raise the appropriate error.
Discussion: https://postgr.es/m/[email protected]
---
src/backend/utils/adt/pg_locale_builtin.c | 50 +++++++++++++++++++----
src/common/unicode/case_test.c | 37 ++++++++++-------
src/common/unicode_case.c | 46 ++++++++++++---------
src/include/common/unicode_case.h | 8 ++--
4 files changed, 94 insertions(+), 47 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 7c36fd5091b..5619daf43c3 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -98,8 +98,16 @@ static size_t
strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
- return unicode_strlower(dest, destsize, src, srclen,
- locale->builtin.casemap_full);
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strlower(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static size_t
@@ -114,26 +122,50 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
.init = false,
.prev_alnum = false,
};
+ size_t consumed;
+ size_t result;
- return unicode_strtitle(dest, destsize, src, srclen,
- locale->builtin.casemap_full,
- initcap_wbnext, &wbstate);
+ result = unicode_strtitle(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full,
+ initcap_wbnext, &wbstate);
+
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static size_t
strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
- return unicode_strupper(dest, destsize, src, srclen,
- locale->builtin.casemap_full);
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strupper(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static size_t
strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
pg_locale_t locale)
{
- return unicode_strfold(dest, destsize, src, srclen,
- locale->builtin.casemap_full);
+ size_t consumed;
+ size_t result;
+
+ result = unicode_strfold(dest, destsize, src, srclen, &consumed,
+ locale->builtin.casemap_full);
+ if (consumed < srclen)
+ report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+ srclen - consumed);
+
+ return result;
}
static bool
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 31ea94513bf..08421d9e5ca 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -115,6 +115,7 @@ icu_test_full(char *str)
char icu_fold[BUFSZ];
UErrorCode status;
size_t len = strlen(str);
+ size_t consumed;
/* full case mapping doesn't use posix semantics */
struct WordBoundaryState wbstate = {
@@ -126,10 +127,10 @@ icu_test_full(char *str)
.prev_alnum = false,
};
- unicode_strlower(lower, BUFSZ, str, len, true);
- unicode_strtitle(title, BUFSZ, str, len, true, initcap_wbnext, &wbstate);
- unicode_strupper(upper, BUFSZ, str, len, true);
- unicode_strfold(fold, BUFSZ, str, len, true);
+ unicode_strlower(lower, BUFSZ, str, len, &consumed, true);
+ unicode_strtitle(title, BUFSZ, str, len, &consumed, true, initcap_wbnext, &wbstate);
+ unicode_strupper(upper, BUFSZ, str, len, &consumed, true);
+ unicode_strfold(fold, BUFSZ, str, len, &consumed, true);
status = U_ZERO_ERROR;
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, len, &status);
status = U_ZERO_ERROR;
@@ -260,13 +261,16 @@ static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
- return unicode_strlower(dst, dstsize, src, srclen, true);
+ size_t consumed;
+
+ return unicode_strlower(dst, dstsize, src, srclen, &consumed, true);
}
static size_t
tfunc_title(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
+ size_t consumed;
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
@@ -275,28 +279,33 @@ tfunc_title(char *dst, size_t dstsize, const char *src,
.prev_alnum = false,
};
- return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
- &wbstate);
+ return unicode_strtitle(dst, dstsize, src, srclen, &consumed, true,
+ initcap_wbnext, &wbstate);
}
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
- return unicode_strupper(dst, dstsize, src, srclen, true);
+ size_t consumed;
+
+ return unicode_strupper(dst, dstsize, src, srclen, &consumed, true);
}
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
size_t srclen)
{
- return unicode_strfold(dst, dstsize, src, srclen, true);
+ size_t consumed;
+
+ return unicode_strfold(dst, dstsize, src, srclen, &consumed, true);
}
static void
test_convert_case(void)
{
size_t needed;
+ size_t consumed;
/* test string with no case changes */
test_convert(tfunc_lower, "√∞", "√∞");
@@ -323,11 +332,11 @@ test_convert_case(void)
test_convert(tfunc_title, "\uFF11a", "\uFF11a");
/* invalid UTF8: truncated multibyte sequence */
- needed = unicode_strfold(NULL, 0, "abc\xCE", 4, false);
- Assert(needed == 3);
- /* invalid UTF8: invalid byte */
- needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, false);
- Assert(needed == 3);
+ needed = unicode_strfold(NULL, 0, "abc\xCE", 4, &consumed, false);
+ Assert(needed == 3 && consumed == 3);
+ /* invalid UTF8: leading byte invalid length */
+ needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, &consumed, false);
+ Assert(needed == 3 && consumed == 3);
#ifdef USE_ICU
icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index dd5b3ba86d0..24753aaab09 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -40,8 +40,8 @@ static const char32_t *const casekind_map[NCaseKind] =
static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
- CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
- void *wbstate);
+ size_t *pconsumed, CaseKind str_casekind, bool full,
+ WordBoundaryNext wbnext, void *wbstate);
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
char32_t *simple, const char32_t **special);
@@ -82,7 +82,8 @@ unicode_casefold_simple(char32_t code)
* unicode_strlower()
*
* Convert src to lowercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -98,17 +99,18 @@ unicode_casefold_simple(char32_t code)
*/
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full)
+ size_t *pconsumed, bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
- NULL);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseLower, full,
+ NULL, NULL);
}
/*
* unicode_strtitle()
*
* Convert src to titlecase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -134,17 +136,19 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full, WordBoundaryNext wbnext, void *wbstate)
+ size_t *pconsumed, bool full, WordBoundaryNext wbnext,
+ void *wbstate)
{
- return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
- wbstate);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseTitle, full,
+ wbnext, wbstate);
}
/*
* unicode_strupper()
*
* Convert src to uppercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -160,17 +164,18 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full)
+ size_t *pconsumed, bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
- NULL);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseUpper, full,
+ NULL, NULL);
}
/*
* unicode_strfold()
*
* Case fold src, and return the result length (not including terminating
- * NUL).
+ * NUL). Sets *pconsumed to the amount of src successfully consumed; if less
+ * than srclen, indicates a decoding error.
*
* String src must be encoded in UTF-8.
*
@@ -183,10 +188,10 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
*/
size_t
unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
- bool full)
+ size_t *pconsumed, bool full)
{
- return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
- NULL);
+ return convert_case(dst, dstsize, src, srclen, pconsumed, CaseFold, full,
+ NULL, NULL);
}
/* local version of pg_utf_mblen() to be inlinable */
@@ -223,8 +228,8 @@ utf8_mblen(const unsigned char *s)
*/
static size_t
convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
- CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
- void *wbstate)
+ size_t *pconsumed, CaseKind str_casekind, bool full,
+ WordBoundaryNext wbnext, void *wbstate)
{
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
@@ -315,6 +320,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
if (result_len < dstsize)
dst[result_len] = '\0';
+ *pconsumed = srcoff;
return result_len;
}
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 03add78cabe..1cbc0c14bc2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -21,13 +21,13 @@ char32_t unicode_titlecase_simple(char32_t code);
char32_t unicode_uppercase_simple(char32_t code);
char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full);
+ size_t srclen, size_t *pconsumed, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full,
+ size_t srclen, size_t *pconsumed, bool full,
WordBoundaryNext wbnext, void *wbstate);
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full);
+ size_t srclen, size_t *pconsumed, bool full);
size_t unicode_strfold(char *dst, size_t dstsize, const char *src,
- size_t srclen, bool full);
+ size_t srclen, size_t *pconsumed, bool full);
#endif /* UNICODE_CASE_H */
--
2.43.0
From 08acf6743051710355b6c8c6c9102a7922a3d163 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 4 Jun 2026 12:08:51 -0700
Subject: [PATCH v5 4/5] Validating, iterator-friendly UTF8 encoder/decoder
API.
Discussion: https://postgr.es/m/[email protected]
---
src/include/mb/pg_wchar.h | 139 +++++++++++++++++++++++++++++++++++++-
1 file changed, 137 insertions(+), 2 deletions(-)
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index deee2a832c3..bc445be0678 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -27,6 +27,11 @@
*/
typedef unsigned int pg_wchar;
+/*
+ * Returned for decoding failures in utf8decode() and utf8_to_unicode().
+ */
+#define PG_INVALID_CODEPOINT 0xFFFFFFFF
+
/*
* Maximum byte length of multibyte characters in any backend encoding
*/
@@ -392,11 +397,140 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
}
+/*
+ * Encode the codepoint as UTF8 and return the number of bytes required. If
+ * the number of bytes required exceeds dstsize, just return the number of
+ * bytes required without modifying dst. If dstsize is zero, dst may be
+ * NULL. If codepoint is not a valid Unicode Scalar, return -1.
+ */
+static inline int
+utf8encode(unsigned char *dst, size_t dstsize, char32_t codepoint)
+{
+ int nbytes;
+
+ if (codepoint <= 0x7F)
+ nbytes = 1;
+ else if (codepoint <= 0x7FF)
+ nbytes = 2;
+ else if (codepoint <= 0xFFFF)
+ {
+ /* surrogate halves not valid for UTF8 */
+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
+ return -1;
+ nbytes = 3;
+ }
+ else if (codepoint <= 0x10FFFF)
+ nbytes = 4;
+ else
+ return -1;
+
+ if ((size_t) nbytes > dstsize)
+ return nbytes;
+
+ if (codepoint <= 0x7F)
+ {
+ dst[0] = codepoint;
+ }
+ else if (codepoint <= 0x7FF)
+ {
+ dst[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+ dst[1] = 0x80 | (codepoint & 0x3F);
+ }
+ else if (codepoint <= 0xFFFF)
+ {
+ dst[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+ dst[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+ dst[2] = 0x80 | (codepoint & 0x3F);
+ }
+ else if (codepoint <= 0x10FFFF)
+ {
+ dst[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+ dst[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+ dst[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+ dst[3] = 0x80 | (codepoint & 0x3F);
+ }
+
+ return nbytes;
+}
+
+/*
+ * Decode the next Unicode codepoint from UTF8 at src, reading no more than
+ * srclen bytes (which must be at least 1). On success, *pcodepoint will be a
+ * valid Unicode Scalar; otherwise it will be set to PG_INVALID_CODEPOINT.
+ *
+ * Returns the number of bytes consumed. If srclen is not large enough
+ * (i.e. src is truncated in the middle of a sequence), returns 0. If invalid,
+ * returns -1.
+ */
+static inline int
+utf8decode(char32_t *pcodepoint, const unsigned char *src, size_t srclen)
+{
+ int nbytes;
+ char32_t codepoint;
+ char32_t min;
+
+ Assert(srclen >= 1);
+
+ if ((*src & 0x80) == 0)
+ {
+ *pcodepoint = (char32_t) src[0];
+ return 1;
+ }
+ else if ((*src & 0xe0) == 0xc0)
+ {
+ nbytes = 2;
+ min = 0x80;
+ codepoint = (char32_t) src[0] & 0x1f;
+ }
+ else if ((*src & 0xf0) == 0xe0)
+ {
+ nbytes = 3;
+ min = 0x800;
+ codepoint = (char32_t) src[0] & 0x0f;
+ }
+ else if ((*src & 0xf8) == 0xf0)
+ {
+ nbytes = 4;
+ min = 0x10000;
+ codepoint = (char32_t) src[0] & 0x07;
+ }
+ else
+ goto invalid; /* invalid leading byte */
+
+ /* truncated multibyte sequence */
+ if (srclen < (size_t) nbytes)
+ {
+ *pcodepoint = PG_INVALID_CODEPOINT;
+ return 0;
+ }
+
+ for (int i = 1; i < nbytes; i++)
+ {
+ if ((src[i] & 0xc0) != 0x80)
+ goto invalid;
+ codepoint = (codepoint << 6) | (src[i] & 0x3f);
+ }
+
+ /* reject overlong, surrogate, and out-of-range */
+ if (codepoint < min || codepoint > 0x10FFFF ||
+ (codepoint >= 0xD800 && codepoint <= 0xDFFF))
+ goto invalid;
+
+ *pcodepoint = codepoint;
+ return nbytes;
+
+invalid:
+ *pcodepoint = PG_INVALID_CODEPOINT;
+ return -1;
+}
+
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
+ *
+ * XXX: Callers should consider utf8decode() instead.
*/
static inline char32_t
utf8_to_unicode(const unsigned char *c)
@@ -416,13 +550,14 @@ utf8_to_unicode(const unsigned char *c)
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
- /* that is an invalid code on purpose */
- return 0xffffffff;
+ return PG_INVALID_CODEPOINT;
}
/*
* Map a Unicode code point to UTF-8. utf8string must have at least
* unicode_utf8len(c) bytes available.
+ *
+ * XXX: Callers should consider utf8encode() instead.
*/
static inline unsigned char *
unicode_to_utf8(char32_t c, unsigned char *utf8string)
--
2.43.0
From 91a0f3cac34d3a4030cd8adf8553261f33804d6d Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Fri, 19 Jun 2026 14:58:47 -0700
Subject: [PATCH v5 5/5] unicode_case.c: use new utf8encode/utf8decode APIs.
Discussion: https://postgr.es/m/[email protected]
---
src/backend/utils/adt/pg_locale_builtin.c | 10 +--
src/common/unicode/case_test.c | 45 +++++++++----
src/common/unicode_case.c | 77 +++++++++++------------
3 files changed, 77 insertions(+), 55 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 5619daf43c3..63516e7174f 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,22 +62,22 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len)
{
- int ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
- wbstate->offset);
+ int ulen;
char32_t u;
bool curr_alnum;
size_t prev_offset = wbstate->offset;
+ ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+ wbstate->len - wbstate->offset);
+
/* invalid UTF8 */
- if (wbstate->offset + ulen > wbstate->len)
+ if (ulen <= 0)
{
wbstate->init = true;
wbstate->offset = wbstate->len;
return prev_offset;
}
- u = utf8_to_unicode((const unsigned char *) wbstate->str +
- wbstate->offset);
curr_alnum = pg_u_isalnum(u, wbstate->posix);
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 08421d9e5ca..9461b56742b 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -52,24 +52,35 @@ initcap_wbnext(void *state)
{
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
- while (wbstate->offset < wbstate->len &&
- wbstate->str[wbstate->offset] != '\0')
+ while (wbstate->offset < wbstate->len)
{
- char32_t u = utf8_to_unicode((const unsigned char *) wbstate->str +
- wbstate->offset);
- bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
+ int ulen;
+ char32_t u;
+ bool curr_alnum;
+ size_t prev_offset = wbstate->offset;
- if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+ ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+ wbstate->len - wbstate->offset);
+
+ /* invalid UTF8 */
+ if (ulen <= 0)
{
- size_t prev_offset = wbstate->offset;
+ wbstate->init = true;
+ wbstate->offset = wbstate->len;
+ return prev_offset;
+ }
+
+ curr_alnum = pg_u_isalnum(u, wbstate->posix);
+ if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+ {
wbstate->init = true;
- wbstate->offset += unicode_utf8len(u);
+ wbstate->offset += ulen;
wbstate->prev_alnum = curr_alnum;
return prev_offset;
}
- wbstate->offset += unicode_utf8len(u);
+ wbstate->offset += ulen;
}
return wbstate->len;
@@ -179,7 +190,7 @@ test_icu(void)
{
pg_unicode_category category = unicode_category(code);
- if (category != PG_U_UNASSIGNED)
+ if (category != PG_U_UNASSIGNED && category != PG_U_SURROGATE)
{
uint8_t icu_category = u_charType(code);
char code_str[5] = {0};
@@ -191,7 +202,7 @@ test_icu(void)
}
icu_test_simple(code);
- unicode_to_utf8(code, (unsigned char *) code_str);
+ utf8encode((unsigned char *) code_str, 5, code);
icu_test_full(code_str);
successful++;
@@ -337,6 +348,18 @@ test_convert_case(void)
/* invalid UTF8: leading byte invalid length */
needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, &consumed, false);
Assert(needed == 3 && consumed == 3);
+ /* invalid UTF8: surrogates */
+ needed = unicode_strfold(NULL, 0, "abc\xED\xA0\x81xyz", 7, &consumed, false);
+ Assert(needed == 3 && consumed == 3);
+ /* invalid UTF8: continuation with no leading byte */
+ needed = unicode_strfold(NULL, 0, "abc\x80xyz", 7, &consumed, false);
+ Assert(needed == 3 && consumed == 3);
+ /* invalid UTF8: out of range */
+ needed = unicode_strfold(NULL, 0, "abc\xF5\x80\x80\x80xyz", 7, &consumed, false);
+ Assert(needed == 3 && consumed == 3);
+ /* invalid UTF8: overlong */
+ needed = unicode_strfold(NULL, 0, "abc\xC1\xBFxyz", 7, &consumed, false);
+ Assert(needed == 3 && consumed == 3);
#ifdef USE_ICU
icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 24753aaab09..4d8ee71e8dc 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -194,22 +194,6 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
NULL, NULL);
}
-/* local version of pg_utf_mblen() to be inlinable */
-static int
-utf8_mblen(const unsigned char *s)
-{
- if ((*s & 0x80) == 0)
- return 1;
- else if ((*s & 0xe0) == 0xc0)
- return 2;
- else if ((*s & 0xf0) == 0xe0)
- return 3;
- else if ((*s & 0xf8) == 0xf0)
- return 4;
- else
- return -1;
-}
-
/*
* Implement Unicode Default Case Conversion algorithm.
*
@@ -248,18 +232,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
while (srcoff < srclen)
{
- int u1len = utf8_mblen((const unsigned char *) src + srcoff);
char32_t u1;
+ int u1len;
char32_t simple = 0;
const char32_t *special = NULL;
enum CaseMapResult casemap_result;
+ u1len = utf8decode(&u1, (const unsigned char *) src + srcoff,
+ srclen - srcoff);
+
/* invalid UTF8 */
- if (u1len < 0 || srcoff + u1len > srclen)
+ if (u1len <= 0)
break;
- u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
-
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
@@ -280,6 +265,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
/* no mapping; copy bytes from src */
Assert(simple == 0);
Assert(special == NULL);
+
if (result_len + u1len <= dstsize)
memcpy(dst + result_len, src + srcoff, u1len);
@@ -289,11 +275,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
{
/* replace with single character */
char32_t u2 = simple;
- char32_t u2len = unicode_utf8len(u2);
+ int u2len;
+ size_t remaining = 0;
+ unsigned char *p = NULL;
+
+ if (dstsize > result_len)
+ {
+ remaining = dstsize - result_len;
+ p = (unsigned char *) dst + result_len;
+ }
Assert(special == NULL);
- if (result_len + u2len <= dstsize)
- unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+ u2len = utf8encode(p, remaining, u2);
+ Assert(u2len > 0);
result_len += u2len;
}
@@ -304,10 +298,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
char32_t u2 = special[i];
- size_t u2len = unicode_utf8len(u2);
+ int u2len;
+ size_t remaining = 0;
+ unsigned char *p = NULL;
+
+ if (dstsize > result_len)
+ {
+ remaining = dstsize - result_len;
+ p = (unsigned char *) dst + result_len;
+ }
- if (result_len + u2len <= dstsize)
- unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+ u2len = utf8encode(p, remaining, u2);
+ Assert(u2len > 0);
result_len += u2len;
}
@@ -352,13 +354,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
/* now at leading byte of previous sequence */
Assert((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0);
- ulen = utf8_mblen((const unsigned char *) str + i);
-
- /* invalid UTF8 */
- if (ulen < 0 || i + ulen > len)
- return false;
+ ulen = utf8decode(&curr, (const unsigned char *) str + i, len - i);
- curr = utf8_to_unicode((const unsigned char *) str + i);
+ if (ulen <= 0)
+ return false; /* invalid UTF8 */
if (!pg_u_prop_case_ignorable(curr))
{
@@ -367,18 +366,18 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
}
}
- ulen = utf8_mblen((const unsigned char *) str + offset);
+ ulen = utf8decode(&curr, (const unsigned char *) str + offset,
+ len - offset);
+ if (ulen <= 0)
+ return false; /* invalid UTF8 */
/* iterate forward looking for following character */
for (int i = offset + ulen; i < len;)
{
- ulen = utf8_mblen((const unsigned char *) str + i);
-
- /* invalid UTF8 */
- if (ulen < 0 || i + ulen > len)
- return false;
+ ulen = utf8decode(&curr, (const unsigned char *) str + i, len - i);
- curr = utf8_to_unicode((const unsigned char *) str + i);
+ if (ulen <= 0)
+ return false; /* invalid UTF8 */
if (!pg_u_prop_case_ignorable(curr))
{
--
2.43.0