Small patch to improve safety of utf8_to_unicode().

Jeff Davis Fri, 12 Dec 2025 15:24:29 -0800

Attached.

From 4777d1af75bc9e570e7db954b6642916fa1420bc Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 4 Dec 2025 19:37:32 -0800
Subject: [PATCH v1] Make utf8_to_unicode() safer.

---
 contrib/fuzzystrmatch/daitch_mokotoff.c   |  3 ++-
 src/backend/utils/adt/pg_locale_builtin.c |  5 +++--
 src/backend/utils/adt/varlena.c           | 26 +++++++++++------------
 src/common/saslprep.c                     |  4 +++-
 src/common/unicode/case_test.c            |  5 +++--
 src/common/unicode_case.c                 | 12 +++++++----
 src/common/wchar.c                        |  3 ++-
 src/include/mb/pg_wchar.h                 | 14 +++++-------
 8 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c
index 07f895ae2bf..47bd2814460 100644
--- a/contrib/fuzzystrmatch/daitch_mokotoff.c
+++ b/contrib/fuzzystrmatch/daitch_mokotoff.c
@@ -401,7 +401,8 @@ read_char(const unsigned char *str, int *ix)
 
 	/* Decode UTF-8 character to ISO 10646 code point. */
 	str += *ix;
-	c = utf8_to_unicode(str);
+	/* Trust that the input is not a truncated byte sequence. */
+	c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
 
 	/* Advance *ix, but (for safety) not if we've reached end of string. */
 	if (c)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0d4c754a267..5c1358e5347 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -59,12 +59,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index baa5b44ea8d..d684370900d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5423,19 +5423,17 @@ Datum
 unicode_assigned(PG_FUNCTION_ARGS)
 {
 	text	   *input = PG_GETARG_TEXT_PP(0);
-	unsigned char *p;
-	int			size;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 
 	if (GetDatabaseEncoding() != PG_UTF8)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
 
 	/* convert to char32_t */
-	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	p = (unsigned char *) VARDATA_ANY(input);
-	for (int i = 0; i < size; i++)
+	while (p < p_end)
 	{
-		char32_t	uchar = utf8_to_unicode(p);
+		char32_t	uchar = utf8_to_unicode(p, p_end - p);
 		int			category = unicode_category(uchar);
 
 		if (category == PG_U_UNASSIGNED)
@@ -5456,7 +5454,8 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	text	   *result;
 	int			i;
 
@@ -5465,14 +5464,13 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* action */
 	output_chars = unicode_normalize(form, input_chars);
@@ -5522,7 +5520,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	int			i;
 	UnicodeNormalizationQC quickcheck;
 	int			output_size;
@@ -5533,14 +5532,13 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* quick check (see UAX #15) */
 	quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 101e8d65a4d..083702654b0 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1055,6 +1055,7 @@ pg_saslprep(const char *input, char **output)
 	int			i;
 	bool		contains_RandALCat;
 	unsigned char *p;
+	unsigned char *p_end;
 	char32_t   *wp;
 
 	/* Ensure we return *output as NULL on failure */
@@ -1088,9 +1089,10 @@ pg_saslprep(const char *input, char **output)
 		goto oom;
 
 	p = (unsigned char *) input;
+	p_end = p + strlen(input);
 	for (i = 0; i < input_size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 00d4f85e5a5..e63ce2fbeb9 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -51,12 +51,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index e5e494db43c..a760094104c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -223,15 +223,19 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 		   (str_casekind != CaseTitle && !wbnext && !wbstate));
 
+	if (srclen < 0)
+		srclen = strlen(src);
+
 	if (str_casekind == CaseTitle)
 	{
 		boundary = wbnext(wbstate);
 		Assert(boundary == 0);	/* start of text is always a boundary */
 	}
 
-	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
+	while (srcoff < srclen)
 	{
-		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff,
+										 srclen - srcoff);
 		int			u1len = unicode_utf8len(u1);
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
@@ -320,7 +324,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -344,7 +348,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..c113cadf815 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs)
 static int
 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf8_to_unicode(s));
+	/* trust that input is not a truncated byte sequence */
+	return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN));
 }
 
 /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4d84bdc81e4..6dc0fff332f 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -558,22 +558,20 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
 /*
  * Convert a UTF-8 character to a Unicode code point.
  * This is a one-character version of pg_utf2wchar_with_len.
- *
- * No error checks here, c must point to a long-enough string.
  */
 static inline char32_t
-utf8_to_unicode(const unsigned char *c)
+utf8_to_unicode(const unsigned char *c, size_t len)
 {
-	if ((*c & 0x80) == 0)
+	if ((*c & 0x80) == 0 && len >= 1)
 		return (char32_t) c[0];
-	else if ((*c & 0xe0) == 0xc0)
+	else if ((*c & 0xe0) == 0xc0 && len >= 2)
 		return (char32_t) (((c[0] & 0x1f) << 6) |
 						   (c[1] & 0x3f));
-	else if ((*c & 0xf0) == 0xe0)
+	else if ((*c & 0xf0) == 0xe0 && len >= 3)
 		return (char32_t) (((c[0] & 0x0f) << 12) |
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
-	else if ((*c & 0xf8) == 0xf0)
+	else if ((*c & 0xf8) == 0xf0 && len >= 4)
 		return (char32_t) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
 						   ((c[2] & 0x3f) << 6) |
@@ -676,8 +674,6 @@ extern int	pg_valid_server_encoding(const char *name);
 extern bool is_encoding_supported_by_icu(int encoding);
 extern const char *get_encoding_name_for_icu(int encoding);
 
-extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
-extern char32_t utf8_to_unicode(const unsigned char *c);
 extern bool pg_utf8_islegal(const unsigned char *source, int length);
 extern int	pg_utf_mblen(const unsigned char *s);
 extern int	pg_mule_mblen(const unsigned char *s);
-- 
2.43.0

Small patch to improve safety of utf8_to_unicode().

Reply via email to