Author: brane
Date: Sun May 24 16:32:57 2026
New Revision: 1934559

Log:
Add grapheme-aware UTF-8 string trimming functions, with tests.

* subversion/include/private/svn_utf_private.h
  (svn_utf__cstring_width): New; get display width and string length.
  (svn_utf__cstring_trim_right,
   svn_utf__cstring_trim_left): New; strip graphemes from a string to trim
   it to the given display width.
* subversion/libsvn_subr/utf.c: Include limits.h.
  (svn_utf_cstring_utf8_width): Reimplement here, it doesn't need utf8proc.
* subversion/libsvn_subr/utf8proc.c: Remove include of limits.h.
  (svn_utf_cstring_utf8_width): Remove.
  (svn_utf__cstring_width, skip_graphemes,
   svn_utf__cstring_trim_right, svn_utf__cstring_trim_left): Implement.

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf8_width): Also test svn_utf__cstring_width.
  (test_utf8_trim_right, test_utf8_trim_left): New test functions.
  (test_funcs): Register the new test functions.

Modified:
   subversion/trunk/subversion/include/private/svn_utf_private.h
   subversion/trunk/subversion/libsvn_subr/utf.c
   subversion/trunk/subversion/libsvn_subr/utf8proc.c
   subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h       Sun May 
24 16:03:57 2026        (r1934558)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h       Sun May 
24 16:32:57 2026        (r1934559)
@@ -320,6 +320,41 @@ svn_utf__cstring_utf8_grapheme_breaks(ap
                                       const char *cstr,
                                       apr_pool_t *pool);
 
+/* Return the display width of the UTF-8 string CSTR, or -1 if the string is
+ * not valid. If LENGTH is not NULL, set *LENGTH to the byte-wise length
+ * of CSTR; this is the same as the value returned by strlen(CSTR).
+ */
+apr_ssize_t
+svn_utf__cstring_width(apr_size_t *length, const char *cstr);
+
+/* Trims the UTF-8 string CSTR to at most MAX_WIDTH display columns,
+ * removing excess graphemes from the trailing (right) end of the string.
+ * Returns the display width of the trimmed substring, which can be less than
+ * MAX_WIDTH, and sets *STARTP and *ENDP to the start and one-past-the-end
+ * of the trimmed substring of CSTR.
+ *
+ * If CSTR is not a valid UTF-8 string, the returned value will be -1.
+ */
+apr_ssize_t
+svn_utf__cstring_trim_right(const char **startp,
+                            const char **endp,
+                            const char *cstr,
+                            apr_size_t max_width);
+
+/* Trims the UTF-8 string CSTR to at most MAX_WIDTH display columns,
+ * removing excess graphemes from the leading (left) end of the string.
+ * Returns the display width of the trimmed substring, which can be less than
+ * MAX_WIDTH, and sets *STARTP and *ENDP to the start and one-past-the-end
+ * of the trimmed substring of CSTR.
+ *
+ * If CSTR is not a valid UTF-8 string, the returned value will be -1.
+ */
+apr_ssize_t
+svn_utf__cstring_trim_left(const char **startp,
+                           const char **endp,
+                           const char *cstr,
+                           apr_size_t max_width);
+
 /* Return a new string with a copy of @a cstr allocated in @a pool aligned to
  * the right side with spaces. This function takes UTF-8 multibyte encoding and
  * wcwidth into an account. The new string will be have exacly as much

Modified: subversion/trunk/subversion/libsvn_subr/utf.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf.c       Sun May 24 16:03:57 
2026        (r1934558)
+++ subversion/trunk/subversion/libsvn_subr/utf.c       Sun May 24 16:32:57 
2026        (r1934559)
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#include <limits.h>
 
 #include <apr_strings.h>
 #include <apr_lib.h>
@@ -1040,6 +1041,18 @@ svn_utf_cstring_from_utf8_string(const c
   return err;
 }
 
+int
+svn_utf_cstring_utf8_width(const char *cstr)
+{
+  const apr_ssize_t width = svn_utf__cstring_width(NULL, cstr);
+
+  /* A negative WIDTH (invalid UTF-8) is passed through as is. A width that
+     does not fit into the 'int' this API returns is reported as -1, too. */
+  if (width > INT_MAX)
+    return -1;
+
+  return (int)width;
+}
 
 /* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */
 static void

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c  Sun May 24 16:03:57 
2026        (r1934558)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c  Sun May 24 16:32:57 
2026        (r1934559)
@@ -23,7 +23,6 @@
 
 
 
-#include <limits.h>
 #include <apr_fnmatch.h>
 
 #include "svn_utf.h"
@@ -683,13 +682,18 @@ svn_utf__cstring_utf8_grapheme_breaks(ap
   return total_width;
 }
 
-int
-svn_utf_cstring_utf8_width(const char *cstr)
+apr_ssize_t
+svn_utf__cstring_width(apr_size_t *length, const char *cstr)
 {
+  const char *const start = cstr;
   apr_ssize_t width = 0;
 
   if (*cstr == '\0')
-    return 0;
+    {
+      if (length)
+        *length = 0;
+      return 0;
+    }
 
   /* Convert the UTF-8 string to UTF-32 (UCS4) which is the format
    * utf8proc_charwidth() expects, and get the width of each character.
@@ -709,12 +713,144 @@ svn_utf_cstring_utf8_width(const char *c
       width += utf8proc_charwidth(ucs);
     }
 
-  /* Check for return value overflow. It's unfortunate that we chose
-     to use 'int' for what is essentially a string length value. */
-  if (width > INT_MAX)
+  if (length)
+    *length = cstr - start;
+  return width;
+}
+
+/*
+ * Skip graphemes from the beginning of CSTR until their total width
+ * is MAX_WIDTH, or less if CSTR ends earlier. If the sum of the skipped
+ * grapheme widths is not exactly MAX_WIDTH, then:
+ *   if TRIM_RIGHT is TRUE, stop just _before_ MAX_WIDTH;
+ *   otherwise, stop just _after_ MAX_WIDTH.
+ * Return the total width of the skipped graphemes and set *ENDP to the
+ * start of the first grapheme in CSTR that was not skipped.
+ *
+ * CSTR must not be empty and MAX_WIDTH must not be 0.
+ * Return -1 (and leave *ENDP alone) if the examined part is not valid UTF-8.
+ */
+static apr_ssize_t
+skip_graphemes(const char **endp,
+               const char *cstr,
+               apr_size_t max_width,
+               svn_boolean_t trim_right)
+{
+  apr_ssize_t current_width = 0;  /* width of the graphemes accepted so far */
+  apr_ssize_t next_width = 0;     /* current_width plus the pending grapheme */
+  utf8proc_int32_t state = 0;     /* for utf8proc_grapheme_break_stateful() */
+  utf8proc_int32_t codepoint1;
+  utf8proc_int32_t codepoint2;
+
+  const char *grapheme_end = cstr;  /* end of the last accepted grapheme */
+  int grapheme_width = 0;           /* width of the grapheme being scanned */
+
+  const utf8proc_uint8_t *utf8 = (const utf8proc_uint8_t *)grapheme_end;
+  utf8proc_ssize_t nbytes = utf8proc_iterate(utf8, -1, &codepoint1);
+
+  if (nbytes < 0)
     return -1;
 
-  return (int)width;
+  grapheme_width += utf8proc_charwidth(codepoint1);
+  utf8 += nbytes;
+
+  while(*utf8 && current_width < max_width)
+    {
+      nbytes = utf8proc_iterate(utf8, -1, &codepoint2);
+      if (nbytes < 0)
+        return -1;
+
+      if (utf8proc_grapheme_break_stateful(codepoint1, codepoint2, &state))
+        {
+          next_width = current_width + grapheme_width;
+          if (next_width > max_width)
+            /* Note: current_width < next_width */
+            break;
+
+          current_width = next_width;
+          grapheme_end = (const char *)utf8;
+          grapheme_width = 0;
+        }
+
+      codepoint1 = codepoint2;
+      grapheme_width += utf8proc_charwidth(codepoint1);
+      utf8 += nbytes;
+    }
+
+  /* The loop did not overflow; count the pending (trailing) grapheme. */
+  if (next_width == current_width)
+      next_width = current_width + grapheme_width;
+
+  if (current_width == max_width)       /* exact fit */
+    {
+      *endp = grapheme_end;
+      return current_width;
+    }
+  else
+    {
+      if (next_width <= max_width)      /* CSTR ended first: take all of it */
+        {
+          *endp = (const char *)utf8;
+          return next_width;
+        }
+      else
+        {
+          if (trim_right)               /* drop the overflowing grapheme */
+            {
+              *endp = grapheme_end;
+              return current_width;
+            }
+          else                          /* include the overflowing grapheme */
+            {
+              *endp = (const char *)utf8;
+              return next_width;
+            }
+        }
+    }
+}
+
+apr_ssize_t
+svn_utf__cstring_trim_right(const char **startp,
+                            const char **endp,
+                            const char *cstr,
+                            apr_size_t max_width)
+{
+  /* Begin with an empty result so that *ENDP is defined on every path. */
+  *startp = *endp = cstr;
+  if (!*cstr || max_width == 0)
+    return 0;
+
+  /* Returns -1, leaving *ENDP at CSTR, if the scanned text is not UTF-8. */
+  return skip_graphemes(endp, cstr, max_width, TRUE);
+}
+
+apr_ssize_t
+svn_utf__cstring_trim_left(const char **startp,
+                           const char **endp,
+                           const char *cstr,
+                           apr_size_t max_width)
+{
+  apr_ssize_t width;
+  apr_size_t length;
+  apr_ssize_t skipped;
+
+  /* Begin with an empty result so the outputs are defined on every path. */
+  *startp = *endp = cstr;
+  if (!*cstr || max_width == 0)
+    return 0;
+
+  /* LENGTH is left unset for invalid UTF-8, so bail out before using it. */
+  width = svn_utf__cstring_width(&length, cstr);
+  if (width < 0)
+    return -1;
+
+  *endp = cstr + length;        /* The result always ends where CSTR ends. */
+  if ((apr_size_t)width <= max_width)
+    return width;
+
+  /* Drop just enough leading graphemes to fit into MAX_WIDTH columns. */
+  skipped = skip_graphemes(startp, cstr, width - max_width, FALSE);
+  return (skipped < 0) ? -1 : width - skipped;
 }
 
 /* Advances CSTR by N printable UTF-8 characters */

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c    Sun May 24 
16:03:57 2026        (r1934558)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c    Sun May 24 
16:32:57 2026        (r1934559)
@@ -1000,7 +1000,7 @@ test_utf_xfrm(apr_pool_t *pool)
   return SVN_NO_ERROR;
 }
 
-/* Test data for test_utf8_width and test_utf8_grapheme_breaks */
+/* Test data for width and trimming tests. */
 static const char *fat_emojis =
   "\xf0\x9f\xa5\xba"         /* three emojis, each two columns wide */
   "\xf0\x9f\x91\x89"
@@ -1022,12 +1022,237 @@ static const char *bom = "\xEF\xBB\xBF"
 static svn_error_t *
 test_utf8_width(apr_pool_t *pool)
 {
+  apr_size_t length = (apr_size_t)-147; /* Magic number used to check... */
+
+  SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
+  SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, invalid), -1);
+  SVN_TEST_ASSERT(length == (apr_size_t)-147); /* ...'length' is unchanged. */
+
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(""), 0);
+  SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, ""), 0);
+  SVN_TEST_INT_ASSERT(length, 0);
+
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width("abc123"), 6);
+  SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, "abc123"), 6);
+  SVN_TEST_INT_ASSERT(length, 6);
+
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(fat_emojis), 6);
+  SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, fat_emojis), 6);
+  SVN_TEST_INT_ASSERT(length, strlen(fat_emojis));
+
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(mixup), 10);
-  SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
+  SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, mixup), 10);
+  SVN_TEST_INT_ASSERT(length, strlen(mixup));
+
   SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(bom), 3);
+  SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, bom), 3);
+  SVN_TEST_INT_ASSERT(length, strlen(bom));
+
+  return SVN_NO_ERROR;
+}
+
+static svn_error_t *
+test_utf8_trim_right(apr_pool_t *pool)
+{
+  apr_ssize_t width;
+  const char *start, *end;
+
+  /* Invalid and empty */
+  width = svn_utf__cstring_trim_right(&start, &end, invalid, 1);
+  SVN_TEST_INT_ASSERT(width, -1);
+
+  width = svn_utf__cstring_trim_right(&start, &end, invalid, 0);
+  SVN_TEST_INT_ASSERT(width, 0);
+  SVN_TEST_ASSERT(start == end);
+
+  width = svn_utf__cstring_trim_right(&start, &end, "", 1);
+  SVN_TEST_INT_ASSERT(width, 0);
+  SVN_TEST_ASSERT(start == end);
+
+  /* ASCII */
+  width = svn_utf__cstring_trim_right(&start, &end, "abc123", 10);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, 'a');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 6);
+
+  width = svn_utf__cstring_trim_right(&start, &end, "abc123", 6);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, 'a');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 6);
+
+  width = svn_utf__cstring_trim_right(&start, &end, "abc123", 3);
+  SVN_TEST_INT_ASSERT(width, 3);
+  SVN_TEST_INT_ASSERT(*start, 'a');
+  SVN_TEST_INT_ASSERT(*end, '1');
+  SVN_TEST_INT_ASSERT(end - start, 3);
+
+  /* Accented Latin */
+  width = svn_utf__cstring_trim_right(&start, &end, mixup, 15);
+  SVN_TEST_INT_ASSERT(width, 10);
+  SVN_TEST_INT_ASSERT(*start, 'S');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+  width = svn_utf__cstring_trim_right(&start, &end, mixup, 10);
+  SVN_TEST_INT_ASSERT(width, 10);
+  SVN_TEST_INT_ASSERT(*start, 'S');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+  width = svn_utf__cstring_trim_right(&start, &end, mixup, 7);
+  SVN_TEST_INT_ASSERT(width, 7);
+  SVN_TEST_INT_ASSERT(*start, 'S');
+  SVN_TEST_INT_ASSERT(*end, '\xe1');
+  SVN_TEST_INT_ASSERT(end - start, 23);
+
+  /* Emoji (glyphs two columns wide) */
+  width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 10);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+  width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 6);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+  width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 4);
+  SVN_TEST_INT_ASSERT(width, 4);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\xf0');
+  SVN_TEST_INT_ASSERT(end - start, 8);
+
+  width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 3);
+  SVN_TEST_INT_ASSERT(width, 2);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\xf0');
+  SVN_TEST_INT_ASSERT(end - start, 4);
+
+  /* Byte order mark */
+  width = svn_utf__cstring_trim_right(&start, &end, bom, 5);
+  SVN_TEST_INT_ASSERT(width, 3);
+  SVN_TEST_INT_ASSERT(*start, '\xef');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+  width = svn_utf__cstring_trim_right(&start, &end, bom, 3);
+  SVN_TEST_INT_ASSERT(width, 3);
+  SVN_TEST_INT_ASSERT(*start, '\xef');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+  width = svn_utf__cstring_trim_right(&start, &end, bom, 2);
+  SVN_TEST_INT_ASSERT(width, 2);
+  SVN_TEST_INT_ASSERT(*start, '\xef');
+  SVN_TEST_INT_ASSERT(*end, 'c');
+  SVN_TEST_INT_ASSERT(end - start, 5);
+
+  return SVN_NO_ERROR;
+}
+
+static svn_error_t *
+test_utf8_trim_left(apr_pool_t *pool)
+{
+  apr_ssize_t width;
+  const char *start, *end;
+
+  /* Invalid and empty */
+  width = svn_utf__cstring_trim_left(&start, &end, invalid, 1);
+  SVN_TEST_INT_ASSERT(width, -1);
+
+  width = svn_utf__cstring_trim_left(&start, &end, invalid, 0);
+  SVN_TEST_INT_ASSERT(width, 0);
+  SVN_TEST_ASSERT(start == end);
+
+  width = svn_utf__cstring_trim_left(&start, &end, "", 1);
+  SVN_TEST_INT_ASSERT(width, 0);
+  SVN_TEST_ASSERT(start == end);
+
+  /* ASCII */
+  width = svn_utf__cstring_trim_left(&start, &end, "abc123", 10);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, 'a');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 6);
+
+  width = svn_utf__cstring_trim_left(&start, &end, "abc123", 6);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, 'a');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 6);
+
+  width = svn_utf__cstring_trim_left(&start, &end, "abc123", 3);
+  SVN_TEST_INT_ASSERT(width, 3);
+  SVN_TEST_INT_ASSERT(*start, '1');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 3);
+
+  /* Accented Latin */
+  width = svn_utf__cstring_trim_left(&start, &end, mixup, 15);
+  SVN_TEST_INT_ASSERT(width, 10);
+  SVN_TEST_INT_ASSERT(*start, 'S');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+  width = svn_utf__cstring_trim_left(&start, &end, mixup, 10);
+  SVN_TEST_INT_ASSERT(width, 10);
+  SVN_TEST_INT_ASSERT(*start, 'S');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+  width = svn_utf__cstring_trim_left(&start, &end, mixup, 6);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, 'e');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 21);
+
+  /* Emoji (glyphs two columns wide) */
+  width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 10);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+  width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 6);
+  SVN_TEST_INT_ASSERT(width, 6);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+  width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 4);
+  SVN_TEST_INT_ASSERT(width, 4);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 8);
+
+  width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 3);
+  SVN_TEST_INT_ASSERT(width, 2);
+  SVN_TEST_INT_ASSERT(*start, '\xf0');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 4);
+
+  /* Byte order mark */
+  width = svn_utf__cstring_trim_left(&start, &end, bom, 5);
+  SVN_TEST_INT_ASSERT(width, 3);
+  SVN_TEST_INT_ASSERT(*start, '\xef');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+  width = svn_utf__cstring_trim_left(&start, &end, bom, 3);
+  SVN_TEST_INT_ASSERT(width, 3);
+  SVN_TEST_INT_ASSERT(*start, '\xef');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+  width = svn_utf__cstring_trim_left(&start, &end, bom, 2);
+  SVN_TEST_INT_ASSERT(width, 2);
+  SVN_TEST_INT_ASSERT(*start, 'b');
+  SVN_TEST_INT_ASSERT(*end, '\0');
+  SVN_TEST_INT_ASSERT(end - start, 2);
 
   return SVN_NO_ERROR;
 }
@@ -1151,6 +1376,10 @@ static struct svn_test_descriptor_t test
                    "test svn_utf__xfrm"),
     SVN_TEST_PASS2(test_utf8_width,
                    "test svn_utf_cstring_utf8_width"),
+    SVN_TEST_PASS2(test_utf8_trim_right,
+                   "test grapheme-aware right trim"),
+    SVN_TEST_PASS2(test_utf8_trim_left,
+                   "test grapheme-aware left trim"),
     SVN_TEST_PASS2(test_utf8_grapheme_breaks,
                    "test utf8 grapheme breaks"),
     SVN_TEST_PASS2(test_utf8_align,

Reply via email to