Re: Small patch to improve safety of utf8_to_unicode().

Jeff Davis Wed, 24 Jun 2026 14:58:02 -0700

On Wed, 2026-06-24 at 16:44 +0800, Chao Li wrote:
> There is a compile warning against pg_wchar.h in 0004:


Fixed. I also changed utf8decode() to use a loop, which makes the code
slightly smaller; that's good if we intend it to be inlined by a lot of callers.

Regards,
        Jeff Davis

From 9a9bfbc4f3866d77e48933df052086a2558e7263 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 22 Jun 2026 16:30:01 -0700
Subject: [PATCH v5 1/5] unicode_case.c: defend against invalid UTF8.

Reviewed-by: Chao Li <[email protected]>
Discussion: https://postgr.es/m/[email protected]
Backpatch-through: 17
---
 src/backend/utils/adt/pg_locale_builtin.c | 24 ++++++++---
 src/common/unicode/case_test.c            |  8 ++++
 src/common/unicode_case.c                 | 52 +++++++++++++++++++----
 3 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 01d4f55b07e..7c36fd5091b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,21 +62,33 @@ initcap_wbnext(void *state)
 
 	while (wbstate->offset < wbstate->len)
 	{
-		char32_t	u = utf8_to_unicode((const unsigned char *) wbstate->str +
+		int			ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
 										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
+		char32_t	u;
+		bool		curr_alnum;
+		size_t		prev_offset = wbstate->offset;
 
-		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		/* invalid UTF8 */
+		if (wbstate->offset + ulen > wbstate->len)
 		{
-			size_t		prev_offset = wbstate->offset;
+			wbstate->init = true;
+			wbstate->offset = wbstate->len;
+			return prev_offset;
+		}
 
+		u = utf8_to_unicode((const unsigned char *) wbstate->str +
+							wbstate->offset);
+		curr_alnum = pg_u_isalnum(u, wbstate->posix);
+
+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		{
 			wbstate->init = true;
-			wbstate->offset += unicode_utf8len(u);
+			wbstate->offset += ulen;
 			wbstate->prev_alnum = curr_alnum;
 			return prev_offset;
 		}
 
-		wbstate->offset += unicode_utf8len(u);
+		wbstate->offset += ulen;
 	}
 
 	return wbstate->len;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index a0dbf00b671..31ea94513bf 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -296,6 +296,8 @@ tfunc_fold(char *dst, size_t dstsize, const char *src,
 static void
 test_convert_case(void)
 {
+	size_t		needed;
+
 	/* test string with no case changes */
 	test_convert(tfunc_lower, "√∞", "√∞");
 	/* test adjust-to-cased behavior */
@@ -320,6 +322,12 @@ test_convert_case(void)
 	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
 	test_convert(tfunc_title, "\uFF11a", "\uFF11a");
 
+	/* invalid UTF8: truncated multibyte sequence */
+	needed = unicode_strfold(NULL, 0, "abc\xCE", 4, false);
+	Assert(needed == 3);
+	/* invalid UTF8: invalid byte */
+	needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, false);
+	Assert(needed == 3);
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index d6ee00b7d9c..42eb7d22211 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -189,6 +189,22 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
 						NULL);
 }
 
+/* local version of pg_utf_mblen() to be inlinable */
+static int
+utf8_mblen(const unsigned char *s)
+{
+	if ((*s & 0x80) == 0)
+		return 1;
+	else if ((*s & 0xe0) == 0xc0)
+		return 2;
+	else if ((*s & 0xf0) == 0xe0)
+		return 3;
+	else if ((*s & 0xf8) == 0xf0)
+		return 4;
+	else
+		return -1;
+}
+
 /*
  * Implement Unicode Default Case Conversion algorithm.
  *
@@ -227,12 +243,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 
 	while (srcoff < srclen)
 	{
-		char32_t	u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
-		int			u1len = unicode_utf8len(u1);
+		int			u1len = utf8_mblen((const unsigned char *) src + srcoff);
+		char32_t	u1;
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
 		enum CaseMapResult casemap_result;
 
+		/* invalid UTF8 */
+		if (u1len < 0 || srcoff + u1len > srclen)
+			break;
+
+		u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
+
 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
@@ -316,7 +338,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			int			u1len = utf8_mblen((const unsigned char *) str + i);
+			char32_t	curr;
+
+			/* invalid UTF8 */
+			if (u1len < 0 || i + u1len > len)
+				return false;
+
+			curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -327,8 +356,8 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		}
 		else if ((str[i] & 0xC0) == 0x80)
 			continue;
-
-		Assert(false);			/* invalid UTF-8 */
+		else
+			return false;			/* invalid UTF8 */
 	}
 
 	/* end of string is not followed by a Cased character */
@@ -340,7 +369,14 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			int			u1len = utf8_mblen((const unsigned char *) str + i);
+			char32_t	curr;
+
+			/* invalid UTF8 */
+			if (u1len < 0 || i + u1len > len)
+				return false;
+
+			curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -351,8 +387,8 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		}
 		else if ((str[i] & 0xC0) == 0x80)
 			continue;
-
-		Assert(false);			/* invalid UTF-8 */
+		else
+			return false;			/* invalid UTF8 */
 	}
 
 	return true;
-- 
2.43.0

From c389e8f3d47c51183702f468846c0ad8c33beaae Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Tue, 23 Jun 2026 17:09:49 -0700
Subject: [PATCH v5 2/5] pg_unicode_fast: fix final sigma logic.

If the sigma is preceded only by Case Ignorable characters, don't
consider it to be a final sigma.

In the process, refactor so that the preceding and following
characters are found first, and then the rule is applied, to improve
clarity.

Discussion: https://postgr.es/m/[email protected]
Backpatch-through: 18
---
 src/common/unicode_case.c                  | 88 ++++++++++------------
 src/test/regress/expected/collate.utf8.out |  6 ++
 src/test/regress/sql/collate.utf8.sql      |  1 +
 3 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 42eb7d22211..dd5b3ba86d0 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -323,75 +323,67 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
  * 3-17. The character at the given offset must be directly preceded by a
  * Cased character, and must not be directly followed by a Cased character.
  *
- * Case_Ignorable characters are ignored. NB: some characters may be both
+ * Case_Ignorable characters are ignored. Neither beginning of string nor end
+ * of string are considered Cased characters. NB: some characters may be both
  * Cased and Case_Ignorable, in which case they are ignored.
  */
 static bool
 check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 {
-	/* the start of the string is not preceded by a Cased character */
-	if (offset == 0)
-		return false;
+	bool		preceded_by_cased = false;
+	bool		followed_by_cased = false;
+	char32_t	curr;
+	int			ulen;
 
-	/* iterate backwards, looking for Cased character */
-	for (int i = offset - 1; i >= 0; i--)
+	/* iterate backwards looking for preceding character */
+	for (int i = offset; i > 0;)
 	{
-		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
-		{
-			int			u1len = utf8_mblen((const unsigned char *) str + i);
-			char32_t	curr;
+		/* skip backwards through continuation bytes */
+		i--;
+		if ((str[i] & 0xC0) == 0x80)
+			continue;
 
-			/* invalid UTF8 */
-			if (u1len < 0 || i + u1len > len)
-				return false;
+		/* now at leading byte of previous sequence */
+		Assert((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0);
 
-			curr = utf8_to_unicode(str + i);
+		ulen = utf8_mblen((const unsigned char *) str + i);
 
-			if (pg_u_prop_case_ignorable(curr))
-				continue;
-			else if (pg_u_prop_cased(curr))
-				break;
-			else
-				return false;
+		/* invalid UTF8 */
+		if (ulen < 0 || i + ulen > len)
+			return false;
+
+		curr = utf8_to_unicode((const unsigned char *) str + i);
+
+		if (!pg_u_prop_case_ignorable(curr))
+		{
+			preceded_by_cased = pg_u_prop_cased(curr);
+			break;
 		}
-		else if ((str[i] & 0xC0) == 0x80)
-			continue;
-		else
-			return false;			/* invalid UTF8 */
 	}
 
-	/* end of string is not followed by a Cased character */
-	if (offset == len)
-		return true;
+	ulen = utf8_mblen((const unsigned char *) str + offset);
 
-	/* iterate forwards, looking for Cased character */
-	for (int i = offset + 1; i < len && str[i] != '\0'; i++)
+	/* iterate forward looking for following character */
+	for (int i = offset + ulen; i < len;)
 	{
-		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
-		{
-			int			u1len = utf8_mblen((const unsigned char *) str + i);
-			char32_t	curr;
+		ulen = utf8_mblen((const unsigned char *) str + i);
 
-			/* invalid UTF8 */
-			if (u1len < 0 || i + u1len > len)
-				return false;
+		/* invalid UTF8 */
+		if (ulen < 0 || i + ulen > len)
+			return false;
 
-			curr = utf8_to_unicode(str + i);
+		curr = utf8_to_unicode((const unsigned char *) str + i);
 
-			if (pg_u_prop_case_ignorable(curr))
-				continue;
-			else if (pg_u_prop_cased(curr))
-				return false;
-			else
-				break;
+		if (!pg_u_prop_case_ignorable(curr))
+		{
+			followed_by_cased = pg_u_prop_cased(curr);
+			break;
 		}
-		else if ((str[i] & 0xC0) == 0x80)
-			continue;
-		else
-			return false;			/* invalid UTF8 */
+
+		i += ulen;
 	}
 
-	return true;
+	return (preceded_by_cased && !followed_by_cased);
 }
 
 /*
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..99fdc111fa4 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -263,6 +263,12 @@ SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
  ᾳσͅα
 (1 row)
 
+SELECT lower(U&'\0300\03A3' COLLATE PG_UNICODE_FAST);
+ lower 
+-------
+ ̀σ
+(1 row)
+
 -- properties
 SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
  ?column? 
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..22aecee3a60 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -128,6 +128,7 @@ SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
 SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
 SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
 SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+SELECT lower(U&'\0300\03A3' COLLATE PG_UNICODE_FAST);
 
 -- properties
 
-- 
2.43.0

From 7f2494e90f041cb6e118c0a6a062b9e5364bc799 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 22 Jun 2026 16:31:38 -0700
Subject: [PATCH v5 3/5] unicode_case.c: change API to signal UTF8 decoding
 error.

Errors at this point are not expected, but if encountered, signal to
the caller so it can raise the appropriate error.

Discussion: https://postgr.es/m/[email protected]
---
 src/backend/utils/adt/pg_locale_builtin.c | 50 +++++++++++++++++++----
 src/common/unicode/case_test.c            | 37 ++++++++++-------
 src/common/unicode_case.c                 | 46 ++++++++++++---------
 src/include/common/unicode_case.h         |  8 ++--
 4 files changed, 94 insertions(+), 47 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 7c36fd5091b..5619daf43c3 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -98,8 +98,16 @@ static size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strlower(dest, destsize, src, srclen,
-							locale->builtin.casemap_full);
+	size_t		consumed;
+	size_t		result;
+
+	result = unicode_strlower(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
@@ -114,26 +122,50 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 		.init = false,
 		.prev_alnum = false,
 	};
+	size_t		consumed;
+	size_t		result;
 
-	return unicode_strtitle(dest, destsize, src, srclen,
-							locale->builtin.casemap_full,
-							initcap_wbnext, &wbstate);
+	result = unicode_strtitle(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full,
+							  initcap_wbnext, &wbstate);
+
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strupper(dest, destsize, src, srclen,
-							locale->builtin.casemap_full);
+	size_t		consumed;
+	size_t		result;
+
+	result = unicode_strupper(dest, destsize, src, srclen, &consumed,
+							  locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static size_t
 strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
 				pg_locale_t locale)
 {
-	return unicode_strfold(dest, destsize, src, srclen,
-						   locale->builtin.casemap_full);
+	size_t		consumed;
+	size_t		result;
+
+	result = unicode_strfold(dest, destsize, src, srclen, &consumed,
+							 locale->builtin.casemap_full);
+	if (consumed < srclen)
+		report_invalid_encoding(GetDatabaseEncoding(), src + consumed,
+								srclen - consumed);
+
+	return result;
 }
 
 static bool
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 31ea94513bf..08421d9e5ca 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -115,6 +115,7 @@ icu_test_full(char *str)
 	char		icu_fold[BUFSZ];
 	UErrorCode	status;
 	size_t		len = strlen(str);
+	size_t		consumed;
 
 	/* full case mapping doesn't use posix semantics */
 	struct WordBoundaryState wbstate = {
@@ -126,10 +127,10 @@ icu_test_full(char *str)
 		.prev_alnum = false,
 	};
 
-	unicode_strlower(lower, BUFSZ, str, len, true);
-	unicode_strtitle(title, BUFSZ, str, len, true, initcap_wbnext, &wbstate);
-	unicode_strupper(upper, BUFSZ, str, len, true);
-	unicode_strfold(fold, BUFSZ, str, len, true);
+	unicode_strlower(lower, BUFSZ, str, len, &consumed, true);
+	unicode_strtitle(title, BUFSZ, str, len, &consumed, true, initcap_wbnext, &wbstate);
+	unicode_strupper(upper, BUFSZ, str, len, &consumed, true);
+	unicode_strfold(fold, BUFSZ, str, len, &consumed, true);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, len, &status);
 	status = U_ZERO_ERROR;
@@ -260,13 +261,16 @@ static size_t
 tfunc_lower(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
-	return unicode_strlower(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strlower(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static size_t
 tfunc_title(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
+	size_t		consumed;
 	struct WordBoundaryState wbstate = {
 		.str = src,
 		.len = srclen,
@@ -275,28 +279,33 @@ tfunc_title(char *dst, size_t dstsize, const char *src,
 		.prev_alnum = false,
 	};
 
-	return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
-							&wbstate);
+	return unicode_strtitle(dst, dstsize, src, srclen, &consumed, true,
+							initcap_wbnext, &wbstate);
 }
 
 static size_t
 tfunc_upper(char *dst, size_t dstsize, const char *src,
 			size_t srclen)
 {
-	return unicode_strupper(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strupper(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static size_t
 tfunc_fold(char *dst, size_t dstsize, const char *src,
 		   size_t srclen)
 {
-	return unicode_strfold(dst, dstsize, src, srclen, true);
+	size_t		consumed;
+
+	return unicode_strfold(dst, dstsize, src, srclen, &consumed, true);
 }
 
 static void
 test_convert_case(void)
 {
 	size_t		needed;
+	size_t		consumed;
 
 	/* test string with no case changes */
 	test_convert(tfunc_lower, "√∞", "√∞");
@@ -323,11 +332,11 @@ test_convert_case(void)
 	test_convert(tfunc_title, "\uFF11a", "\uFF11a");
 
 	/* invalid UTF8: truncated multibyte sequence */
-	needed = unicode_strfold(NULL, 0, "abc\xCE", 4, false);
-	Assert(needed == 3);
-	/* invalid UTF8: invalid byte */
-	needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, false);
-	Assert(needed == 3);
+	needed = unicode_strfold(NULL, 0, "abc\xCE", 4, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: leading byte invalid length */
+	needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index dd5b3ba86d0..24753aaab09 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -40,8 +40,8 @@ static const char32_t *const casekind_map[NCaseKind] =
 
 static char32_t find_case_map(char32_t ucs, const char32_t *map);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
-						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
-						   void *wbstate);
+						   size_t *pconsumed, CaseKind str_casekind, bool full,
+						   WordBoundaryNext wbnext, void *wbstate);
 static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
 								  const char *src, size_t srclen, size_t srcoff,
 								  char32_t *simple, const char32_t **special);
@@ -82,7 +82,8 @@ unicode_casefold_simple(char32_t code)
  * unicode_strlower()
  *
  * Convert src to lowercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -98,17 +99,18 @@ unicode_casefold_simple(char32_t code)
  */
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full)
+				 size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseLower, full,
+						NULL, NULL);
 }
 
 /*
  * unicode_strtitle()
  *
  * Convert src to titlecase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -134,17 +136,19 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full, WordBoundaryNext wbnext, void *wbstate)
+				 size_t *pconsumed, bool full, WordBoundaryNext wbnext,
+				 void *wbstate)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
-						wbstate);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseTitle, full,
+						wbnext, wbstate);
 }
 
 /*
  * unicode_strupper()
  *
  * Convert src to uppercase, and return the result length (not including
- * terminating NUL).
+ * terminating NUL). Sets *pconsumed to the amount of src successfully
+ * consumed; if less than srclen, indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -160,17 +164,18 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
-				 bool full)
+				 size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseUpper, full,
+						NULL, NULL);
 }
 
 /*
  * unicode_strfold()
  *
  * Case fold src, and return the result length (not including terminating
- * NUL).
+ * NUL). Sets *pconsumed to the amount of src successfully consumed; if less
+ * than srclen, indicates a decoding error.
  *
  * String src must be encoded in UTF-8.
  *
@@ -183,10 +188,10 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
  */
 size_t
 unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
-				bool full)
+				size_t *pconsumed, bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
-						NULL);
+	return convert_case(dst, dstsize, src, srclen, pconsumed, CaseFold, full,
+						NULL, NULL);
 }
 
 /* local version of pg_utf_mblen() to be inlinable */
@@ -223,8 +228,8 @@ utf8_mblen(const unsigned char *s)
  */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
-			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
-			 void *wbstate)
+			 size_t *pconsumed, CaseKind str_casekind, bool full,
+			 WordBoundaryNext wbnext, void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
@@ -315,6 +320,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 	if (result_len < dstsize)
 		dst[result_len] = '\0';
 
+	*pconsumed = srcoff;
 	return result_len;
 }
 
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 03add78cabe..1cbc0c14bc2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -21,13 +21,13 @@ char32_t	unicode_titlecase_simple(char32_t code);
 char32_t	unicode_uppercase_simple(char32_t code);
 char32_t	unicode_casefold_simple(char32_t code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full);
+							 size_t srclen, size_t *pconsumed, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full,
+							 size_t srclen, size_t *pconsumed, bool full,
 							 WordBoundaryNext wbnext, void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
-							 size_t srclen, bool full);
+							 size_t srclen, size_t *pconsumed, bool full);
 size_t		unicode_strfold(char *dst, size_t dstsize, const char *src,
-							size_t srclen, bool full);
+							size_t srclen, size_t *pconsumed, bool full);
 
 #endif							/* UNICODE_CASE_H */
-- 
2.43.0

From 08acf6743051710355b6c8c6c9102a7922a3d163 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 4 Jun 2026 12:08:51 -0700
Subject: [PATCH v5 4/5] Validating, iterator-friendly UTF8 encoder/decoder
 API.

Discussion: https://postgr.es/m/[email protected]
---
 src/include/mb/pg_wchar.h | 139 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index deee2a832c3..bc445be0678 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -27,6 +27,11 @@
  */
 typedef unsigned int pg_wchar;
 
+/*
+ * Returned for decoding failures in utf8decode() and utf8_to_unicode().
+ */
+#define PG_INVALID_CODEPOINT	0xFFFFFFFF
+
 /*
  * Maximum byte length of multibyte characters in any backend encoding
  */
@@ -392,11 +397,140 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
 	return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
 }
 
+/*
+ * Encode the codepoint as UTF8 and return the number of bytes required. If
+ * the number of bytes required exceeds dstsize, just return the number of
+ * bytes required without modifying dst. If dstsize is zero, dst may be
+ * NULL. If codepoint is not a valid Unicode Scalar, return -1.
+ */
+static inline int
+utf8encode(unsigned char *dst, size_t dstsize, char32_t codepoint)
+{
+	int			nbytes;
+
+	if (codepoint <= 0x7F)
+		nbytes = 1;
+	else if (codepoint <= 0x7FF)
+		nbytes = 2;
+	else if (codepoint <= 0xFFFF)
+	{
+		/* surrogate halves not valid for UTF8 */
+		if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
+			return -1;
+		nbytes = 3;
+	}
+	else if (codepoint <= 0x10FFFF)
+		nbytes = 4;
+	else
+		return -1;
+
+	if ((size_t) nbytes > dstsize)
+		return nbytes;
+
+	if (codepoint <= 0x7F)
+	{
+		dst[0] = codepoint;
+	}
+	else if (codepoint <= 0x7FF)
+	{
+		dst[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+		dst[1] = 0x80 | (codepoint & 0x3F);
+	}
+	else if (codepoint <= 0xFFFF)
+	{
+		dst[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+		dst[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+		dst[2] = 0x80 | (codepoint & 0x3F);
+	}
+	else if (codepoint <= 0x10FFFF)
+	{
+		dst[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+		dst[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+		dst[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+		dst[3] = 0x80 | (codepoint & 0x3F);
+	}
+
+	return nbytes;
+}
+
+/*
+ * Decode the next Unicode codepoint from UTF8 at src, reading no more than
+ * srclen bytes (which must be at least 1). On success, *pcodepoint will be a
+ * valid Unicode Scalar; otherwise it will be set to PG_INVALID_CODEPOINT.
+ *
+ * Returns the number of bytes consumed. If srclen is not large enough
+ * (i.e. src is truncated in the middle of a sequence), returns 0. If invalid,
+ * returns -1.
+ */
+static inline int
+utf8decode(char32_t *pcodepoint, const unsigned char *src, size_t srclen)
+{
+	int			nbytes;
+	char32_t	codepoint;
+	char32_t	min;
+
+	Assert(srclen >= 1);
+
+	if ((*src & 0x80) == 0)
+	{
+		*pcodepoint = (char32_t) src[0];
+		return 1;
+	}
+	else if ((*src & 0xe0) == 0xc0)
+	{
+		nbytes = 2;
+		min = 0x80;
+		codepoint = (char32_t) src[0] & 0x1f;
+	}
+	else if ((*src & 0xf0) == 0xe0)
+	{
+		nbytes = 3;
+		min = 0x800;
+		codepoint = (char32_t) src[0] & 0x0f;
+	}
+	else if ((*src & 0xf8) == 0xf0)
+	{
+		nbytes = 4;
+		min = 0x10000;
+		codepoint = (char32_t) src[0] & 0x07;
+	}
+	else
+		goto invalid;		/* invalid leading byte */
+
+	/* truncated multibyte sequence */
+	if (srclen < (size_t) nbytes)
+	{
+		*pcodepoint = PG_INVALID_CODEPOINT;
+		return 0;
+	}
+
+	for (int i = 1; i < nbytes; i++)
+	{
+		if ((src[i] & 0xc0) != 0x80)
+			goto invalid;
+		codepoint = (codepoint << 6) | (src[i] & 0x3f);
+	}
+
+	/* reject overlong, surrogate, and out-of-range */
+	if (codepoint < min || codepoint > 0x10FFFF ||
+		(codepoint >= 0xD800 && codepoint <= 0xDFFF))
+		goto invalid;
+
+	*pcodepoint = codepoint;
+	return nbytes;
+
+invalid:
+	*pcodepoint = PG_INVALID_CODEPOINT;
+	return -1;
+}
+
 /*
  * Convert a UTF-8 character to a Unicode code point.
  * This is a one-character version of pg_utf2wchar_with_len.
  *
  * No error checks here, c must point to a long-enough string.
+ *
+ * XXX: Callers should consider utf8decode() instead.
  */
 static inline char32_t
 utf8_to_unicode(const unsigned char *c)
@@ -416,13 +550,14 @@ utf8_to_unicode(const unsigned char *c)
 						   ((c[2] & 0x3f) << 6) |
 						   (c[3] & 0x3f));
 	else
-		/* that is an invalid code on purpose */
-		return 0xffffffff;
+		return PG_INVALID_CODEPOINT;
 }
 
 /*
  * Map a Unicode code point to UTF-8.  utf8string must have at least
  * unicode_utf8len(c) bytes available.
+ *
+ * XXX: Callers should consider utf8encode() instead.
  */
 static inline unsigned char *
 unicode_to_utf8(char32_t c, unsigned char *utf8string)
-- 
2.43.0

From 91a0f3cac34d3a4030cd8adf8553261f33804d6d Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Fri, 19 Jun 2026 14:58:47 -0700
Subject: [PATCH v5 5/5] unicode_case.c: use new utf8encode/utf8decode APIs.

Discussion: https://postgr.es/m/[email protected]
---
 src/backend/utils/adt/pg_locale_builtin.c | 10 +--
 src/common/unicode/case_test.c            | 45 +++++++++----
 src/common/unicode_case.c                 | 77 +++++++++++------------
 3 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 5619daf43c3..63516e7174f 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -62,22 +62,22 @@ initcap_wbnext(void *state)
 
 	while (wbstate->offset < wbstate->len)
 	{
-		int			ulen = pg_utf_mblen((const unsigned char *) wbstate->str +
-										wbstate->offset);
+		int			ulen;
 		char32_t	u;
 		bool		curr_alnum;
 		size_t		prev_offset = wbstate->offset;
 
+		ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+						  wbstate->len - wbstate->offset);
+
 		/* invalid UTF8 */
-		if (wbstate->offset + ulen > wbstate->len)
+		if (ulen <= 0)
 		{
 			wbstate->init = true;
 			wbstate->offset = wbstate->len;
 			return prev_offset;
 		}
 
-		u = utf8_to_unicode((const unsigned char *) wbstate->str +
-							wbstate->offset);
 		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 08421d9e5ca..9461b56742b 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -52,24 +52,35 @@ initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
 
-	while (wbstate->offset < wbstate->len &&
-		   wbstate->str[wbstate->offset] != '\0')
+	while (wbstate->offset < wbstate->len)
 	{
-		char32_t	u = utf8_to_unicode((const unsigned char *) wbstate->str +
-										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
+		int			ulen;
+		char32_t	u;
+		bool		curr_alnum;
+		size_t		prev_offset = wbstate->offset;
 
-		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		ulen = utf8decode(&u, (const unsigned char *) wbstate->str + wbstate->offset,
+						  wbstate->len - wbstate->offset);
+
+		/* invalid UTF8 */
+		if (ulen <= 0)
 		{
-			size_t		prev_offset = wbstate->offset;
+			wbstate->init = true;
+			wbstate->offset = wbstate->len;
+			return prev_offset;
+		}
+
+		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		{
 			wbstate->init = true;
-			wbstate->offset += unicode_utf8len(u);
+			wbstate->offset += ulen;
 			wbstate->prev_alnum = curr_alnum;
 			return prev_offset;
 		}
 
-		wbstate->offset += unicode_utf8len(u);
+		wbstate->offset += ulen;
 	}
 
 	return wbstate->len;
@@ -179,7 +190,7 @@ test_icu(void)
 	{
 		pg_unicode_category category = unicode_category(code);
 
-		if (category != PG_U_UNASSIGNED)
+		if (category != PG_U_UNASSIGNED && category != PG_U_SURROGATE)
 		{
 			uint8_t		icu_category = u_charType(code);
 			char		code_str[5] = {0};
@@ -191,7 +202,7 @@ test_icu(void)
 			}
 
 			icu_test_simple(code);
-			unicode_to_utf8(code, (unsigned char *) code_str);
+			utf8encode((unsigned char *) code_str, 5, code);
 			icu_test_full(code_str);
 
 			successful++;
@@ -337,6 +348,18 @@ test_convert_case(void)
 	/* invalid UTF8: leading byte invalid length */
 	needed = unicode_strfold(NULL, 0, "abc\xF8xyz", 7, &consumed, false);
 	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: encoded surrogate (U+D801) */
+	needed = unicode_strfold(NULL, 0, "abc\xED\xA0\x81xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: continuation with no leading byte */
+	needed = unicode_strfold(NULL, 0, "abc\x80xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: out of range (would decode above U+10FFFF) */
+	needed = unicode_strfold(NULL, 0, "abc\xF5\x80\x80\x80xyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
+	/* invalid UTF8: overlong (two-byte encoding of U+007F) */
+	needed = unicode_strfold(NULL, 0, "abc\xC1\xBFxyz", 7, &consumed, false);
+	Assert(needed == 3 && consumed == 3);
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 24753aaab09..4d8ee71e8dc 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -194,22 +194,6 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
 						NULL, NULL);
 }
 
-/* local version of pg_utf_mblen() to be inlinable */
-static int
-utf8_mblen(const unsigned char *s)
-{
-	if ((*s & 0x80) == 0)
-		return 1;
-	else if ((*s & 0xe0) == 0xc0)
-		return 2;
-	else if ((*s & 0xf0) == 0xe0)
-		return 3;
-	else if ((*s & 0xf8) == 0xf0)
-		return 4;
-	else
-		return -1;
-}
-
 /*
  * Implement Unicode Default Case Conversion algorithm.
  *
@@ -248,18 +232,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 
 	while (srcoff < srclen)
 	{
-		int			u1len = utf8_mblen((const unsigned char *) src + srcoff);
 		char32_t	u1;
+		int			u1len;
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
 		enum CaseMapResult casemap_result;
 
+		u1len = utf8decode(&u1, (const unsigned char *) src + srcoff,
+						   srclen - srcoff);
+
 		/* invalid UTF8 */
-		if (u1len < 0 || srcoff + u1len > srclen)
+		if (u1len <= 0)
 			break;
 
-		u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
-
 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
@@ -280,6 +265,7 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 				/* no mapping; copy bytes from src */
 				Assert(simple == 0);
 				Assert(special == NULL);
+
 				if (result_len + u1len <= dstsize)
 					memcpy(dst + result_len, src + srcoff, u1len);
 
@@ -289,11 +275,19 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 				{
 					/* replace with single character */
 					char32_t	u2 = simple;
-					char32_t	u2len = unicode_utf8len(u2);
+					int			u2len;
+					size_t		remaining = 0;
+					unsigned char *p = NULL;
+
+					if (dstsize > result_len)
+					{
+						remaining = dstsize - result_len;
+						p = (unsigned char *) dst + result_len;
+					}
 
 					Assert(special == NULL);
-					if (result_len + u2len <= dstsize)
-						unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+					u2len = utf8encode(p, remaining, u2);
+					Assert(u2len > 0);
 
 					result_len += u2len;
 				}
@@ -304,10 +298,18 @@ convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
 				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
 				{
 					char32_t	u2 = special[i];
-					size_t		u2len = unicode_utf8len(u2);
+					int			u2len;
+					size_t		remaining = 0;
+					unsigned char *p = NULL;
+
+					if (dstsize > result_len)
+					{
+						remaining = dstsize - result_len;
+						p = (unsigned char *) dst + result_len;
+					}
 
-					if (result_len + u2len <= dstsize)
-						unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+					u2len = utf8encode(p, remaining, u2);
+					Assert(u2len > 0);
 
 					result_len += u2len;
 				}
@@ -352,13 +354,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		/* now at leading byte of previous sequence */
 		Assert((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0);
 
-		ulen = utf8_mblen((const unsigned char *) str + i);
-
-		/* invalid UTF8 */
-		if (ulen < 0 || i + ulen > len)
-			return false;
+		ulen = utf8decode(&curr, (const unsigned char *) str + i, len - i);
 
-		curr = utf8_to_unicode((const unsigned char *) str + i);
+		if (ulen <= 0)
+			return false;		/* invalid UTF8 */
 
 		if (!pg_u_prop_case_ignorable(curr))
 		{
@@ -367,18 +366,18 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 		}
 	}
 
-	ulen = utf8_mblen((const unsigned char *) str + offset);
+	ulen = utf8decode(&curr, (const unsigned char *) str + offset,
+					  len - offset);
+	if (ulen <= 0)
+		return false;			/* invalid UTF8 */
 
 	/* iterate forward looking for following character */
 	for (int i = offset + ulen; i < len;)
 	{
-		ulen = utf8_mblen((const unsigned char *) str + i);
-
-		/* invalid UTF8 */
-		if (ulen < 0 || i + ulen > len)
-			return false;
+		ulen = utf8decode(&curr, (const unsigned char *) str + i, len - i);
 
-		curr = utf8_to_unicode((const unsigned char *) str + i);
+		if (ulen <= 0)
+			return false;		/* invalid UTF8 */
 
 		if (!pg_u_prop_case_ignorable(curr))
 		{
-- 
2.43.0

Re: Small patch to improve safety of utf8_to_unicode().

Reply via email to