On Sun, 2025-12-14 at 07:22 +0800, Chao Li wrote:
> This patch adds a length check to utf8_to_unicode(), I think which is
> where “safety” comes from. Can you please add an a little bit more to
> the commit message instead of only saying “improve safety”.
Right: it does not read past pg_mblen(), nor past the supplied length.
We could go further and check for valid continuation bytes, but the
other routines don't do that.
> It also deleted two redundant function declarations from pg_wchar.h,
> which may also worth a quick note in the commit message.
I committed that as an independent change and removed it from this
patch.
> + /* Assume byte sequence has not been broken. */
> + c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
> ```
>
> Here we need an empty line above the new comment.
Done, and I expanded the comment to explain why it's safe.
> pg_utf_dsplen(const unsigned char *s)
> {
> - return ucs_wcwidth(utf8_to_unicode(s));
> + /* trust that input is not a truncated byte sequence */
> + return ucs_wcwidth(utf8_to_unicode(s,
> MAX_MULTIBYTE_CHAR_LEN));
> }
> ```
>
> For the new comment, as a code reader, I wonder why we “trust” that?
We could use strlen(), but I was concerned that it might be used for
string fragments that aren't NUL-terminated, because it's intended for
a single character. A caller might reasonably assume that it wouldn't
read past pg_mblen().
So I changed the comment slightly to just say that it requires the
input is a valid UTF-8 sequence. Let me know if you have another
suggestion.
Regards,
Jeff Davis
From 94a80385eec23dbb85cc601063cac04e8d845a52 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 4 Dec 2025 19:37:32 -0800
Subject: [PATCH v2] Make utf8_to_unicode() safer.
Require the input string length so that it doesn't read past the end
of a string if given an invalid (or truncated) byte sequence.
Discussion: https://postgr.es/m/[email protected]
---
contrib/fuzzystrmatch/daitch_mokotoff.c | 8 ++++++-
src/backend/utils/adt/pg_locale_builtin.c | 5 +++--
src/backend/utils/adt/varlena.c | 26 +++++++++++------------
src/common/saslprep.c | 4 +++-
src/common/unicode/case_test.c | 5 +++--
src/common/unicode_case.c | 12 +++++++----
src/common/wchar.c | 3 ++-
src/include/mb/pg_wchar.h | 12 +++++------
8 files changed, 43 insertions(+), 32 deletions(-)
diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c
index 07f895ae2bf..103a29d89d4 100644
--- a/contrib/fuzzystrmatch/daitch_mokotoff.c
+++ b/contrib/fuzzystrmatch/daitch_mokotoff.c
@@ -401,7 +401,13 @@ read_char(const unsigned char *str, int *ix)
/* Decode UTF-8 character to ISO 10646 code point. */
str += *ix;
- c = utf8_to_unicode(str);
+
+ /*
+ * We assume the string does not contain truncated byte sequences because
+ * it came from pg_server_to_any(..., PG_UTF8), so we can safely use
+ * MAX_MULTIBYTE_CHAR_LEN.
+ */
+ c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
/* Advance *ix, but (for safety) not if we've reached end of string. */
if (c)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c2920112bb..28fc55278b0 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -59,12 +59,13 @@ static size_t
initcap_wbnext(void *state)
{
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+ unsigned char *str = (unsigned char *) wbstate->str;
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
- wbstate->offset);
+ char32_t u = utf8_to_unicode(str + wbstate->offset,
+ wbstate->len - wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index baa5b44ea8d..d684370900d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5423,19 +5423,17 @@ Datum
unicode_assigned(PG_FUNCTION_ARGS)
{
text *input = PG_GETARG_TEXT_PP(0);
- unsigned char *p;
- int size;
+ unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+ unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
if (GetDatabaseEncoding() != PG_UTF8)
ereport(ERROR,
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
/* convert to char32_t */
- size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- p = (unsigned char *) VARDATA_ANY(input);
- for (int i = 0; i < size; i++)
+ while (p < p_end)
{
- char32_t uchar = utf8_to_unicode(p);
+ char32_t uchar = utf8_to_unicode(p, p_end - p);
int category = unicode_category(uchar);
if (category == PG_U_UNASSIGNED)
@@ -5456,7 +5454,8 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
int size;
char32_t *input_chars;
char32_t *output_chars;
- unsigned char *p;
+ unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+ unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
text *result;
int i;
@@ -5465,14 +5464,13 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
/* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
input_chars = palloc((size + 1) * sizeof(char32_t));
- p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
- input_chars[i] = utf8_to_unicode(p);
+ input_chars[i] = utf8_to_unicode(p, p_end - p);
p += pg_utf_mblen(p);
}
input_chars[i] = (char32_t) '\0';
- Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+ Assert(p == p_end);
/* action */
output_chars = unicode_normalize(form, input_chars);
@@ -5522,7 +5520,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
int size;
char32_t *input_chars;
char32_t *output_chars;
- unsigned char *p;
+ unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+ unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
int i;
UnicodeNormalizationQC quickcheck;
int output_size;
@@ -5533,14 +5532,13 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
/* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
input_chars = palloc((size + 1) * sizeof(char32_t));
- p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
- input_chars[i] = utf8_to_unicode(p);
+ input_chars[i] = utf8_to_unicode(p, p_end - p);
p += pg_utf_mblen(p);
}
input_chars[i] = (char32_t) '\0';
- Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+ Assert(p == p_end);
/* quick check (see UAX #15) */
quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 101e8d65a4d..083702654b0 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1055,6 +1055,7 @@ pg_saslprep(const char *input, char **output)
int i;
bool contains_RandALCat;
unsigned char *p;
+ unsigned char *p_end;
char32_t *wp;
/* Ensure we return *output as NULL on failure */
@@ -1088,9 +1089,10 @@ pg_saslprep(const char *input, char **output)
goto oom;
p = (unsigned char *) input;
+ p_end = p + strlen(input);
for (i = 0; i < input_size; i++)
{
- input_chars[i] = utf8_to_unicode(p);
+ input_chars[i] = utf8_to_unicode(p, p_end - p);
p += pg_utf_mblen(p);
}
input_chars[i] = (char32_t) '\0';
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 00d4f85e5a5..e63ce2fbeb9 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -51,12 +51,13 @@ static size_t
initcap_wbnext(void *state)
{
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+ unsigned char *str = (unsigned char *) wbstate->str;
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
- wbstate->offset);
+ char32_t u = utf8_to_unicode(str + wbstate->offset,
+ wbstate->len - wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index e5e494db43c..a760094104c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -223,15 +223,19 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
(str_casekind != CaseTitle && !wbnext && !wbstate));
+ if (srclen < 0)
+ srclen = strlen(src);
+
if (str_casekind == CaseTitle)
{
boundary = wbnext(wbstate);
Assert(boundary == 0); /* start of text is always a boundary */
}
- while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
+ while (srcoff < srclen)
{
- char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+ char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff,
+ srclen - srcoff);
int u1len = unicode_utf8len(u1);
char32_t simple = 0;
const char32_t *special = NULL;
@@ -320,7 +324,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- char32_t curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i, len - i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -344,7 +348,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- char32_t curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i, len - i);
if (pg_u_prop_case_ignorable(curr))
continue;
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..a104234ae92 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs)
static int
pg_utf_dsplen(const unsigned char *s)
{
- return ucs_wcwidth(utf8_to_unicode(s));
+ /* input must be a valid UTF-8 sequence */
+ return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN));
}
/*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 5df67ceea87..6dc0fff332f 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -558,22 +558,20 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
- *
- * No error checks here, c must point to a long-enough string.
*/
static inline char32_t
-utf8_to_unicode(const unsigned char *c)
+utf8_to_unicode(const unsigned char *c, size_t len)
{
- if ((*c & 0x80) == 0)
+ if ((*c & 0x80) == 0 && len >= 1)
return (char32_t) c[0];
- else if ((*c & 0xe0) == 0xc0)
+ else if ((*c & 0xe0) == 0xc0 && len >= 2)
return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
- else if ((*c & 0xf0) == 0xe0)
+ else if ((*c & 0xf0) == 0xe0 && len >= 3)
return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
- else if ((*c & 0xf8) == 0xf0)
+ else if ((*c & 0xf8) == 0xf0 && len >= 4)
return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
--
2.43.0