Attached.
From 4777d1af75bc9e570e7db954b6642916fa1420bc Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Thu, 4 Dec 2025 19:37:32 -0800 Subject: [PATCH v1] Make utf8_to_unicode() safer. --- contrib/fuzzystrmatch/daitch_mokotoff.c | 3 ++- src/backend/utils/adt/pg_locale_builtin.c | 5 +++-- src/backend/utils/adt/varlena.c | 26 +++++++++++------------ src/common/saslprep.c | 4 +++- src/common/unicode/case_test.c | 5 +++-- src/common/unicode_case.c | 12 +++++++---- src/common/wchar.c | 3 ++- src/include/mb/pg_wchar.h | 14 +++++------- 8 files changed, 38 insertions(+), 34 deletions(-) diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c index 07f895ae2bf..47bd2814460 100644 --- a/contrib/fuzzystrmatch/daitch_mokotoff.c +++ b/contrib/fuzzystrmatch/daitch_mokotoff.c @@ -401,7 +401,8 @@ read_char(const unsigned char *str, int *ix) /* Decode UTF-8 character to ISO 10646 code point. */ str += *ix; - c = utf8_to_unicode(str); + /* Assume byte sequence has not been broken. */ + c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN); /* Advance *ix, but (for safety) not if we've reached end of string. */ if (c) diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 0d4c754a267..5c1358e5347 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -59,12 +59,13 @@ static size_t initcap_wbnext(void *state) { struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; + unsigned char *str = (unsigned char *) wbstate->str; while (wbstate->offset < wbstate->len && wbstate->str[wbstate->offset] != '\0') { - char32_t u = utf8_to_unicode((unsigned char *) wbstate->str + - wbstate->offset); + char32_t u = utf8_to_unicode(str + wbstate->offset, + wbstate->len - wbstate->offset); bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index baa5b44ea8d..d684370900d 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -5423,19 +5423,17 @@ Datum unicode_assigned(PG_FUNCTION_ARGS) { text *input = PG_GETARG_TEXT_PP(0); - unsigned char *p; - int size; + unsigned char *p = (unsigned char *) VARDATA_ANY(input); + unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input); if (GetDatabaseEncoding() != PG_UTF8) ereport(ERROR, (errmsg("Unicode categorization can only be performed if server encoding is UTF8"))); /* convert to char32_t */ - size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); - p = (unsigned char *) VARDATA_ANY(input); - for (int i = 0; i < size; i++) + while (p < p_end) { - char32_t uchar = utf8_to_unicode(p); + char32_t uchar = utf8_to_unicode(p, p_end - p); int category = unicode_category(uchar); if (category == PG_U_UNASSIGNED) @@ -5456,7 +5454,8 @@ unicode_normalize_func(PG_FUNCTION_ARGS) int size; char32_t *input_chars; char32_t *output_chars; - unsigned char *p; + unsigned char *p = (unsigned char *) VARDATA_ANY(input); + unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input); text *result; int i; @@ -5465,14 +5464,13 @@ unicode_normalize_func(PG_FUNCTION_ARGS) /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); input_chars = palloc((size + 1) * sizeof(char32_t)); - p = (unsigned char *) VARDATA_ANY(input); for (i = 0; i < size; i++) { - input_chars[i] = utf8_to_unicode(p); + input_chars[i] = utf8_to_unicode(p, p_end - p); p += pg_utf_mblen(p); } input_chars[i] = (char32_t) '\0'; - Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); + Assert(p == p_end); /* action */ output_chars = unicode_normalize(form, input_chars); @@ -5522,7 +5520,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS) int size; char32_t *input_chars; char32_t *output_chars; - unsigned char *p; + unsigned char *p = (unsigned char *) VARDATA_ANY(input); + unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input); int i; UnicodeNormalizationQC quickcheck; int output_size; @@ -5533,14 +5532,13 @@ unicode_is_normalized(PG_FUNCTION_ARGS) /* convert to char32_t */ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); input_chars = palloc((size + 1) * sizeof(char32_t)); - p = (unsigned char *) VARDATA_ANY(input); for (i = 0; i < size; i++) { - input_chars[i] = utf8_to_unicode(p); + input_chars[i] = utf8_to_unicode(p, p_end - p); p += pg_utf_mblen(p); } input_chars[i] = (char32_t) '\0'; - Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); + Assert(p == p_end); /* quick check (see UAX #15) */ quickcheck = unicode_is_normalized_quickcheck(form, input_chars); diff --git a/src/common/saslprep.c b/src/common/saslprep.c index 101e8d65a4d..083702654b0 100644 --- a/src/common/saslprep.c +++ b/src/common/saslprep.c @@ -1055,6 +1055,7 @@ pg_saslprep(const char *input, char **output) int i; bool contains_RandALCat; unsigned char *p; + unsigned char *p_end; char32_t *wp; /* Ensure we return *output as NULL on failure */ @@ -1088,9 +1089,10 @@ pg_saslprep(const char *input, char **output) goto oom; p = (unsigned char *) input; + p_end = p + strlen(input); for (i = 0; i < input_size; i++) { - input_chars[i] = utf8_to_unicode(p); + input_chars[i] = utf8_to_unicode(p, p_end - p); p += pg_utf_mblen(p); } input_chars[i] = (char32_t) '\0'; diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index 00d4f85e5a5..e63ce2fbeb9 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -51,12 +51,13 @@ static size_t initcap_wbnext(void *state) { struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; + unsigned char *str = (unsigned char *) wbstate->str; while (wbstate->offset < wbstate->len && wbstate->str[wbstate->offset] != '\0') { - char32_t u = utf8_to_unicode((unsigned char *) wbstate->str + - wbstate->offset); + char32_t u = utf8_to_unicode(str + wbstate->offset, + wbstate->len - wbstate->offset); bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c index e5e494db43c..a760094104c 100644 --- a/src/common/unicode_case.c +++ b/src/common/unicode_case.c @@ -223,15 +223,19 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, Assert((str_casekind == CaseTitle && wbnext && wbstate) || (str_casekind != CaseTitle && !wbnext && !wbstate)); + if (srclen < 0) + srclen = strlen(src); + if (str_casekind == CaseTitle) { boundary = wbnext(wbstate); Assert(boundary == 0); /* start of text is always a boundary */ } - while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0') + while (srcoff < srclen) { - char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff); + char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff, + srclen - srcoff); int u1len = unicode_utf8len(u1); char32_t simple = 0; const char32_t *special = NULL; @@ -320,7 +324,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset) { if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0) { - char32_t curr = utf8_to_unicode(str + i); + char32_t curr = utf8_to_unicode(str + i, len - i); if (pg_u_prop_case_ignorable(curr)) continue; @@ -344,7 +348,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset) { if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0) { - char32_t curr = utf8_to_unicode(str + i); + char32_t curr = utf8_to_unicode(str + i, len - i); if (pg_u_prop_case_ignorable(curr)) continue; diff --git a/src/common/wchar.c b/src/common/wchar.c index a4bc29921de..c113cadf815 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs) static int pg_utf_dsplen(const unsigned char *s) { - return ucs_wcwidth(utf8_to_unicode(s)); + /* trust that input is not a truncated byte sequence */ + return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN)); } /* diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 4d84bdc81e4..6dc0fff332f 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -558,22 +558,20 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second) /* * Convert a UTF-8 character to a Unicode code point. * This is a one-character version of pg_utf2wchar_with_len. - * - * No error checks here, c must point to a long-enough string. */ static inline char32_t -utf8_to_unicode(const unsigned char *c) +utf8_to_unicode(const unsigned char *c, size_t len) { - if ((*c & 0x80) == 0) + if ((*c & 0x80) == 0 && len >= 1) return (char32_t) c[0]; - else if ((*c & 0xe0) == 0xc0) + else if ((*c & 0xe0) == 0xc0 && len >= 2) return (char32_t) (((c[0] & 0x1f) << 6) | (c[1] & 0x3f)); - else if ((*c & 0xf0) == 0xe0) + else if ((*c & 0xf0) == 0xe0 && len >= 3) return (char32_t) (((c[0] & 0x0f) << 12) | ((c[1] & 0x3f) << 6) | (c[2] & 0x3f)); - else if ((*c & 0xf8) == 0xf0) + else if ((*c & 0xf8) == 0xf0 && len >= 4) return (char32_t) (((c[0] & 0x07) << 18) | ((c[1] & 0x3f) << 12) | ((c[2] & 0x3f) << 6) | @@ -676,8 +674,6 @@ extern int pg_valid_server_encoding(const char *name); extern bool is_encoding_supported_by_icu(int encoding); extern const char *get_encoding_name_for_icu(int encoding); -extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string); -extern char32_t utf8_to_unicode(const unsigned char *c); extern bool pg_utf8_islegal(const unsigned char *source, int length); extern int pg_utf_mblen(const unsigned char *s); extern int pg_mule_mblen(const unsigned char *s); -- 2.43.0
