libsvn_subr

brane Sat, 23 May 2026 00:56:53 -0700

Author: brane
Date: Sat May 23 07:56:39 2026
New Revision: 1934528

Log:
Add a utility function to find grapheme boundaries in a UTF-8 string.


* subversion/include/private/svn_utf_private.h
  (svn_utf__utf8_grapheme_t): Describe a grapheme's position in the string
   and its visual width.
  (svn_utf__cstring_utf8_grapheme_breaks): New prototype.

* subversion/libsvn_subr/utf8proc.c: Include limits.h.
  (svn_utf__cstring_utf8_grapheme_breaks): Implement the new function.
  (svn_utf_cstring_utf8_width): Reimplement as a wrapper for the above.

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf8_width): Add a test for the empty string, and test grapheme
   breakdown on the same test data.

Modified:
   subversion/trunk/subversion/include/private/svn_utf_private.h
   subversion/trunk/subversion/libsvn_subr/utf8proc.c
   subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h       Sat May 
23 06:12:43 2026        (r1934527)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h       Sat May 
23 07:56:39 2026        (r1934528)
@@ -291,6 +291,37 @@ svn_utf__utf32_to_utf8(const svn_string_
                        apr_pool_t *result_pool,
                        apr_pool_t *scratch_pool);
 
+/*
+ * One Unicode grapheme in a UTF-8 string: byte range [START, END) and width.
+ */
+typedef struct svn_utf__utf8_grapheme_t
+{
+  /* Byte offset, from the start of the string, of the grapheme's first byte. */
+  apr_size_t start;
+  /* Byte offset of the byte just past the end of the grapheme. */
+  apr_size_t end;
+  /* Estimated visual width: the sum of the widths of its code points. */
+  int width;
+} svn_utf__utf8_grapheme_t;
+
+/*
+ * Find grapheme boundaries within a UTF-8 string CSTR. Return the total
+ * estimated width of all the graphemes in the string. Set *GRAPHEMES to
+ * an array of svn_utf__utf8_grapheme_t allocated from POOL. The final
+ * grapheme in the returned array may not be complete; we don't check if
+ * a grapheme break is allowed at the end because it's, well, the end.
+ *
+ * *GRAPHEMES will be NULL if CSTR is empty.
+ *
+ * If GRAPHEMES is NULL, the list of graphemes will not be allocated
+ * and POOL may also be NULL.
+ *
+ * If CSTR is not a valid UTF-8 string, the returned value will be negative.
+ */
+apr_ssize_t
+svn_utf__cstring_utf8_grapheme_breaks(apr_array_header_t **graphemes,
+                                      const char *cstr,
+                                      apr_pool_t *pool);
 
 /* Return a new string with a copy of @a cstr allocated in @a pool aligned to
  * the right side with spaces. This function takes UTF-8 multibyte encoding and

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c  Sat May 23 06:12:43 
2026        (r1934527)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c  Sat May 23 07:56:39 
2026        (r1934528)
@@ -23,6 +23,7 @@
 
 
 
+#include <limits.h>
 #include <apr_fnmatch.h>
 
 #include "svn_utf.h"
@@ -608,6 +609,104 @@ svn_utf__fuzzy_escape(const char *src, a
   return result->data;
 }
 
+apr_ssize_t
+svn_utf__cstring_utf8_grapheme_breaks(apr_array_header_t **graphemes,
+                                      const char *cstr,
+                                      apr_pool_t *pool)
+{
+  apr_array_header_t *breaks = NULL;
+  apr_ssize_t total_width = 0;
+
+  utf8proc_int32_t state = 0;    /* Carried between break queries. */
+  utf8proc_int32_t codepoint1;
+  utf8proc_int32_t codepoint2;
+
+  int grapheme_width = 0;        /* Width of the grapheme in progress. */
+  apr_size_t grapheme_start = 0; /* Byte offsets into CSTR. */
+  apr_size_t grapheme_end = 0;
+
+  utf8proc_ssize_t nbytes;
+  const utf8proc_uint8_t *utf8 = (const utf8proc_uint8_t *)cstr;
+  /* Give *GRAPHEMES a defined value on every return path, including the
+     invalid-UTF-8 ones, so that callers never read a stale pointer. */
+  if (graphemes)
+    *graphemes = NULL;
+  if (!*utf8)
+    return 0;
+
+  nbytes = utf8proc_iterate(utf8, -1, &codepoint1);
+  if (nbytes < 0)
+    return -1;
+
+  if (graphemes)
+    breaks = apr_array_make(pool, 16, sizeof(svn_utf__utf8_grapheme_t));
+  grapheme_width += utf8proc_charwidth(codepoint1);
+  grapheme_end += nbytes;
+  utf8 += nbytes;
+
+  while (*utf8)
+    {
+      nbytes = utf8proc_iterate(utf8, -1, &codepoint2);
+      if (nbytes < 0)
+        return -1;
+
+      if (utf8proc_grapheme_break_stateful(codepoint1, codepoint2, &state))
+        {
+          if (breaks)
+            {
+              svn_utf__utf8_grapheme_t grapheme;
+              grapheme.start = grapheme_start;
+              grapheme.end = grapheme_end;
+              grapheme.width = grapheme_width;
+              APR_ARRAY_PUSH(breaks, svn_utf__utf8_grapheme_t) = grapheme;
+            }
+
+          total_width += grapheme_width;
+          grapheme_width = 0;
+          grapheme_start = grapheme_end;
+        }
+
+      codepoint1 = codepoint2;
+      grapheme_width += utf8proc_charwidth(codepoint1);
+      grapheme_end += nbytes;
+      utf8 += nbytes;
+    }
+
+  /* Record the final grapheme; no break test is possible at the end. */
+  if (grapheme_end > grapheme_start)
+    {
+      if (breaks)
+        {
+          svn_utf__utf8_grapheme_t grapheme;
+          grapheme.start = grapheme_start;
+          grapheme.end = grapheme_end;
+          grapheme.width = grapheme_width;
+          APR_ARRAY_PUSH(breaks, svn_utf__utf8_grapheme_t) = grapheme;
+        }
+
+      total_width += grapheme_width;
+    }
+
+  if (graphemes)
+    *graphemes = breaks;
+  return total_width;
+}
+
+#if 1
+int
+svn_utf_cstring_utf8_width(const char *cstr)
+{
+  /* No grapheme list is requested (GRAPHEMES is NULL), so POOL may be
+     NULL as well. */
+  const apr_ssize_t total =
+    svn_utf__cstring_utf8_grapheme_breaks(NULL, cstr, NULL);
+
+  /* This API returns 'int' although the value is essentially a string
+     length. A total that does not fit is reported as -1; anything else,
+     including a negative total from the callee, is passed through. */
+  return (total > INT_MAX) ? -1 : (int)total;
+}
+#else
 int
 svn_utf_cstring_utf8_width(const char *cstr)
 {
@@ -641,6 +740,7 @@ svn_utf_cstring_utf8_width(const char *c
 
   return width;
 }
+#endif
 
 /* Advances CSTR by N printable UTF-8 characters */
 static const char *

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c    Sat May 23 
06:12:43 2026        (r1934527)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c    Sat May 23 
07:56:39 2026        (r1934528)
@@ -1003,6 +1003,8 @@ test_utf_xfrm(apr_pool_t *pool)
 static svn_error_t *
 test_utf8_width(apr_pool_t *pool)
 {
+  apr_array_header_t *graphemes;
+
   /* there are three emojis that each have wcwidth of two */
   const char *fat_emojis = "\xf0\x9f\xa5\xba\xf0\x9f\x91\x89\xf0\x9f\x91\x88";
   const char *mixup =
@@ -1019,12 +1021,38 @@ test_utf8_width(apr_pool_t *pool)
   const char *invalid = "a" "\xe6" "bc";
   const char *bom = "\xEF\xBB\xBF" "abc";
 
+  /* Test the public API */
+  SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(""), 0);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width("abc123"), 6);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(fat_emojis), 6);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(mixup), 10);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(bom), 3);
 
+  /* Test grapheme breakdown */
+  svn_utf__cstring_utf8_grapheme_breaks(&graphemes, "", pool);
+  SVN_TEST_ASSERT(graphemes == NULL);
+
+  svn_utf__cstring_utf8_grapheme_breaks(&graphemes, "abc123", pool);
+  SVN_TEST_INT_ASSERT(graphemes->nelts, 6);
+
+  svn_utf__cstring_utf8_grapheme_breaks(&graphemes, fat_emojis, pool);
+  SVN_TEST_INT_ASSERT(graphemes->nelts, 3);
+  SVN_TEST_INT_ASSERT(
+      APR_ARRAY_IDX(graphemes, 0, svn_utf__utf8_grapheme_t).width, 2);
+  SVN_TEST_INT_ASSERT(
+      APR_ARRAY_IDX(graphemes, 1, svn_utf__utf8_grapheme_t).width, 2);
+  SVN_TEST_INT_ASSERT(
+      APR_ARRAY_IDX(graphemes, 2, svn_utf__utf8_grapheme_t).width, 2);
+
+  svn_utf__cstring_utf8_grapheme_breaks(&graphemes, mixup, pool);
+  SVN_TEST_INT_ASSERT(graphemes->nelts, 10);
+
+  svn_utf__cstring_utf8_grapheme_breaks(&graphemes, bom, pool);
+  SVN_TEST_INT_ASSERT(graphemes->nelts, 4);
+  SVN_TEST_INT_ASSERT(
+      APR_ARRAY_IDX(graphemes, 0, svn_utf__utf8_grapheme_t).width, 0);
+
   return SVN_NO_ERROR;
 }

svn commit: r1934528 - in subversion/trunk/subversion: include/private libsvn_subr tests/libsvn_subr

Reply via email to