Author: rinrab
Date: Tue May 19 17:36:19 2026
New Revision: 1934407

Log:
Re-implement svn_utf_cstring_utf8_width() with use of libutf8proc.

The prior implementation of this function consisted of a hand-written
iterator that parsed the UTF-8 string by itself. It then used a hard-coded
dataset of various ranges of Unicode characters to determine the width of
each character.

But we already have utf8proc as a required dependency - a wonderful library
that is specifically made to do all that and has a nice, fast, and up-to-date
dataset of all Unicode characters with all possible data one might ever need.
After this change, both the iteration and dataset are implemented with
utf8proc.

Please see a thread on dev@subversion.apache.org where this idea was discussed [1].

* subversion/libsvn_subr/utf8proc.c
  (svn_utf_cstring_utf8_width): New implementation with help of utf8proc.
* subversion/libsvn_subr/utf_width.c: Shamelessly delete the entire file.
* subversion/tests/libsvn_subr/utf-test.c
  (test_utf8_width): Update test expectation. Haha, now we know about
   emojis! No need for Rust rewrites.

[1] https://lists.apache.org/thread/fjy7p4xf6vwj8qbv59zwf2zwb1ml5pqq

Deleted:
   subversion/trunk/subversion/libsvn_subr/utf_width.c
Modified:
   subversion/trunk/subversion/libsvn_subr/utf8proc.c
   subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c  Tue May 19 17:24:33 2026        (r1934406)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c  Tue May 19 17:36:19 2026        (r1934407)
@@ -59,7 +59,6 @@ svn_utf__utf8proc_runtime_version(void)
 #ifdef UTF8PROC_STATIC
   /* Unused static function warning removal hack. */
   SVN_UNUSED(utf8proc_category_string);
-  SVN_UNUSED(utf8proc_charwidth);
   SVN_UNUSED(utf8proc_charwidth_ambiguous);
   SVN_UNUSED(utf8proc_grapheme_break);
   SVN_UNUSED(utf8proc_islower);
@@ -606,3 +605,40 @@ svn_utf__fuzzy_escape(const char *src, a
 
   return result->data;
 }
+
+/* Return the display width (in columns, as reported by utf8proc) of the
+ * NUL-terminated UTF-8 string CSTR.  Return -1 if CSTR is not valid
+ * UTF-8 or contains a control character, i.e. is not printable.
+ * An empty string has width 0. */
+int
+svn_utf_cstring_utf8_width(const char *cstr)
+{
+  const apr_byte_t *p = (const apr_byte_t *)cstr;
+  int width = 0;
+
+  /* Validate up front so the decoding below cannot hit encoding errors.
+   * The empty string is trivially fine and has width 0. */
+  if (*cstr != '\0' && !svn_utf__cstring_is_valid(cstr))
+    return -1;
+
+  /* Decode one code point at a time and sum the per-character widths. */
+  while (*p)
+    {
+      apr_int32_t ucs;
+      int nbytes = utf8proc_iterate(p, -1, &ucs);
+
+      if (nbytes <= 0)
+        return -1;
+      p += nbytes;
+
+      /* utf8proc_charwidth() reports 0, never -1, for non-printable
+       * code points, so reject C0/C1 control characters and DEL
+       * explicitly to keep the "-1 if not printable" contract. */
+      if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0))
+        return -1;
+
+      width += utf8proc_charwidth(ucs);
+    }
+
+  return width;
+}

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c    Tue May 19 17:24:33 2026        (r1934406)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c    Tue May 19 17:36:19 2026        (r1934407)
@@ -1020,7 +1020,7 @@ test_utf8_width(apr_pool_t *pool)
   const char *bom = "\xEF\xBB\xBF" "abc";
 
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width("abc123"), 6);
-  SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(fat_emojis), 3);
+  SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(fat_emojis), 6);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(mixup), 10);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(bom), 3);

Reply via email to