Author: brane
Date: Sat May 23 07:56:39 2026
New Revision: 1934528
Log:
Add a utility function to find grapheme boundaries in a UTF-8 string.
* subversion/include/private/svn_utf_private.h
(svn_utf__utf8_grapheme_t): Describe a grapheme's position in the string
and its visual width.
(svn_utf__cstring_utf8_grapheme_breaks): New prototype.
* subversion/libsvn_subr/utf8proc.c: Include limits.h.
(svn_utf__cstring_utf8_grapheme_breaks): Implement the new function.
(svn_utf_cstring_utf8_width): Reimplement as a wrapper for the above.
* subversion/tests/libsvn_subr/utf-test.c
(test_utf8_width): Add a test for the empty string, and test grapheme
breakdown on the same test data.
Modified:
subversion/trunk/subversion/include/private/svn_utf_private.h
subversion/trunk/subversion/libsvn_subr/utf8proc.c
subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h Sat May
23 06:12:43 2026 (r1934527)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Sat May
23 07:56:39 2026 (r1934528)
@@ -291,6 +291,37 @@ svn_utf__utf32_to_utf8(const svn_string_
apr_pool_t *result_pool,
apr_pool_t *scratch_pool);
+/*
+ * Describes one Unicode grapheme within a UTF-8 string: the half-open
+ * byte range [START, END) that it occupies in the string, and its
+ * estimated display width.
+ */
+typedef struct svn_utf__utf8_grapheme_t
+{
+ /* The byte offset, counted from the beginning of the string, of the
+ first byte of the grapheme. */
+ apr_size_t start;
+ /* The byte offset of the byte after the end of the grapheme; this is
+ also the START of the following grapheme, if there is one. */
+ apr_size_t end;
+ /* The estimated visual width of the grapheme: the sum of the widths
+ of the code points that it consists of. May be zero. */
+ int width;
+} svn_utf__utf8_grapheme_t;
+
+/*
+ * Find grapheme boundaries within the NUL-terminated UTF-8 string CSTR.
+ * Return the total estimated width of all the graphemes in the string.
+ * Set *GRAPHEMES to an array of svn_utf__utf8_grapheme_t allocated from
+ * POOL, one element per grapheme in string order; the START and END
+ * members of each element are byte offsets into CSTR. The final
+ * grapheme in the returned array may not be complete; we don't check if
+ * a grapheme break is allowed at the end because it's, well, the end.
+ *
+ * *GRAPHEMES will be NULL if CSTR is empty, and the returned width
+ * will be 0.
+ *
+ * If GRAPHEMES is NULL, the list of graphemes will not be allocated
+ * and POOL may also be NULL.
+ *
+ * If CSTR is not a valid UTF-8 string, the returned value will be negative;
+ * in that case, callers should not use the value of *GRAPHEMES.
+ */
+apr_ssize_t
+svn_utf__cstring_utf8_grapheme_breaks(apr_array_header_t **graphemes,
+ const char *cstr,
+ apr_pool_t *pool);
/* Return a new string with a copy of @a cstr allocated in @a pool aligned to
* the right side with spaces. This function takes UTF-8 multibyte encoding and
Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c Sat May 23 06:12:43
2026 (r1934527)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Sat May 23 07:56:39
2026 (r1934528)
@@ -23,6 +23,7 @@
+#include <limits.h>
#include <apr_fnmatch.h>
#include "svn_utf.h"
@@ -608,6 +609,104 @@ svn_utf__fuzzy_escape(const char *src, a
return result->data;
}
+/* Append to BREAKS a grapheme that occupies the bytes [START, END) and
+   has the visual width WIDTH.  Do nothing if BREAKS is NULL, which is
+   the case when the caller did not ask for the list of graphemes. */
+static void
+record_grapheme(apr_array_header_t *breaks,
+                apr_size_t start,
+                apr_size_t end,
+                int width)
+{
+  if (breaks)
+    {
+      svn_utf__utf8_grapheme_t grapheme;
+      grapheme.start = start;
+      grapheme.end = end;
+      grapheme.width = width;
+      APR_ARRAY_PUSH(breaks, svn_utf__utf8_grapheme_t) = grapheme;
+    }
+}
+
+/* Split the UTF-8 string CSTR into graphemes and return the sum of
+   their estimated widths, or -1 if CSTR is not valid UTF-8.
+
+   If GRAPHEMES is not NULL, *GRAPHEMES is set to an array of
+   svn_utf__utf8_grapheme_t allocated from POOL, or to NULL if CSTR is
+   empty or invalid.  If GRAPHEMES is NULL, nothing is allocated and
+   POOL is not used. */
+apr_ssize_t
+svn_utf__cstring_utf8_grapheme_breaks(apr_array_header_t **graphemes,
+                                      const char *cstr,
+                                      apr_pool_t *pool)
+{
+  apr_array_header_t *breaks = NULL;
+  apr_ssize_t total_width = 0;
+
+  /* CODEPOINT1 and CODEPOINT2 are always two adjacent code points;
+     STATE carries the break algorithm's context between pairs. */
+  utf8proc_int32_t state = 0;
+  utf8proc_int32_t codepoint1;
+  utf8proc_int32_t codepoint2;
+
+  /* The grapheme currently being assembled: bytes
+     [GRAPHEME_START, GRAPHEME_END) with width GRAPHEME_WIDTH. */
+  int grapheme_width;
+  apr_size_t grapheme_start = 0;
+  apr_size_t grapheme_end;
+
+  utf8proc_ssize_t nbytes;
+  const utf8proc_uint8_t *utf8 = (const utf8proc_uint8_t *)cstr;
+
+  /* Give the output parameter a defined value on every return path,
+     including the invalid-input ones, so that callers never see an
+     indeterminate pointer.  It is replaced below on success. */
+  if (graphemes)
+    *graphemes = NULL;
+
+  if (!*utf8)
+    return 0;
+
+  /* Decode the first code point; it always begins the first grapheme. */
+  nbytes = utf8proc_iterate(utf8, -1, &codepoint1);
+  if (nbytes < 0)
+    return -1;
+
+  if (graphemes)
+    breaks = apr_array_make(pool, 16, sizeof(svn_utf__utf8_grapheme_t));
+  grapheme_width = utf8proc_charwidth(codepoint1);
+  grapheme_end = nbytes;
+  utf8 += nbytes;
+
+  while (*utf8)
+    {
+      nbytes = utf8proc_iterate(utf8, -1, &codepoint2);
+      if (nbytes < 0)
+        return -1;
+
+      /* A break between the previous and the current code point closes
+         the grapheme assembled so far and starts a new, empty one. */
+      if (utf8proc_grapheme_break_stateful(codepoint1, codepoint2, &state))
+        {
+          record_grapheme(breaks,
+                          grapheme_start, grapheme_end, grapheme_width);
+          total_width += grapheme_width;
+          grapheme_width = 0;
+          grapheme_start = grapheme_end;
+        }
+
+      /* The current code point belongs to the grapheme in progress. */
+      codepoint1 = codepoint2;
+      grapheme_width += utf8proc_charwidth(codepoint1);
+      grapheme_end += nbytes;
+      utf8 += nbytes;
+    }
+
+  /* Record the final grapheme.  It is never empty: every code point
+     decoded above was added to it after the most recent break. */
+  record_grapheme(breaks, grapheme_start, grapheme_end, grapheme_width);
+  total_width += grapheme_width;
+
+  if (graphemes)
+    *graphemes = breaks;
+  return total_width;
+}
+
+#if 1
+int
+svn_utf_cstring_utf8_width(const char *cstr)
+{
+  /* Only the total width is wanted, so pass neither a grapheme list
+     nor a pool.  A negative total (invalid UTF-8) is returned as is. */
+  apr_ssize_t total = svn_utf__cstring_utf8_grapheme_breaks(NULL, cstr, NULL);
+
+  /* This function returns 'int', which is unfortunate for what is
+     essentially a string length value: a total that does not fit is
+     reported as -1. */
+  return (total > INT_MAX) ? -1 : (int)total;
+}
+#else
int
svn_utf_cstring_utf8_width(const char *cstr)
{
@@ -641,6 +740,7 @@ svn_utf_cstring_utf8_width(const char *c
return width;
}
+#endif
/* Advances CSTR by N printable UTF-8 characters */
static const char *
Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Sat May 23
06:12:43 2026 (r1934527)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Sat May 23
07:56:39 2026 (r1934528)
@@ -1003,6 +1003,8 @@ test_utf_xfrm(apr_pool_t *pool)
static svn_error_t *
test_utf8_width(apr_pool_t *pool)
{
+ apr_array_header_t *graphemes;
+
/* there are three emojis that each have wcwidth of two */
const char *fat_emojis = "\xf0\x9f\xa5\xba\xf0\x9f\x91\x89\xf0\x9f\x91\x88";
const char *mixup =
@@ -1019,12 +1021,38 @@ test_utf8_width(apr_pool_t *pool)
const char *invalid = "a" "\xe6" "bc";
const char *bom = "\xEF\xBB\xBF" "abc";
+ /* Test the public API */
+ SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(""), 0);
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width("abc123"), 6);
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(fat_emojis), 6);
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(mixup), 10);
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(bom), 3);
+ /* Test grapheme breakdown */
+ svn_utf__cstring_utf8_grapheme_breaks(&graphemes, "", pool);
+ SVN_TEST_ASSERT(graphemes == NULL);
+
+ svn_utf__cstring_utf8_grapheme_breaks(&graphemes, "abc123", pool);
+ SVN_TEST_INT_ASSERT(graphemes->nelts, 6);
+
+ svn_utf__cstring_utf8_grapheme_breaks(&graphemes, fat_emojis, pool);
+ SVN_TEST_INT_ASSERT(graphemes->nelts, 3);
+ SVN_TEST_INT_ASSERT(
+ APR_ARRAY_IDX(graphemes, 0, svn_utf__utf8_grapheme_t).width, 2);
+ SVN_TEST_INT_ASSERT(
+ APR_ARRAY_IDX(graphemes, 1, svn_utf__utf8_grapheme_t).width, 2);
+ SVN_TEST_INT_ASSERT(
+ APR_ARRAY_IDX(graphemes, 2, svn_utf__utf8_grapheme_t).width, 2);
+
+ svn_utf__cstring_utf8_grapheme_breaks(&graphemes, mixup, pool);
+ SVN_TEST_INT_ASSERT(graphemes->nelts, 10);
+
+ svn_utf__cstring_utf8_grapheme_breaks(&graphemes, bom, pool);
+ SVN_TEST_INT_ASSERT(graphemes->nelts, 4);
+ SVN_TEST_INT_ASSERT(
+ APR_ARRAY_IDX(graphemes, 0, svn_utf__utf8_grapheme_t).width, 0);
+
return SVN_NO_ERROR;
}