The attached patch is from Lukáš Zaoral,
who updated the expand/unexpand implementation in Fedora
to use Collin's mbbuf module,
fixing crash bugs in the original i18n implementation
(https://bugzilla.redhat.com/2443041).

I've confirmed tests and syntax checks pass with this.

I plan to push this later, and we can iterate
on tests etc. after that.

cheers,
Padraig
From 8baccfd2dd4e1005505aeeb74a34feb6bf48be4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Zaoral?= <[email protected]>
Date: Fri, 6 Mar 2026 14:13:17 +0000
Subject: [PATCH] expand,unexpand: support multi-byte input

* src/expand.c: Use mbbuf to support multi-byte input.
* src/unexpand.c: Likewise.
* tests/expand/mb.sh: New multi-byte test.
* tests/unexpand/mb.sh: Likewise.
* tests/local.mk: Reference new tests.
* NEWS: Mention the improvement.
---
 src/expand.c         |  38 +++++++---
 src/unexpand.c       |  55 +++++++++-----
 tests/expand/mb.sh   | 171 +++++++++++++++++++++++++++++++++++++++++++
 tests/local.mk       |   2 +
 tests/unexpand/mb.sh | 163 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 399 insertions(+), 30 deletions(-)
 create mode 100755 tests/expand/mb.sh
 create mode 100755 tests/unexpand/mb.sh

diff --git a/src/expand.c b/src/expand.c
index cbf659c17..6d4223c9b 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -37,7 +37,11 @@
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
+
 #include "system.h"
+#include "ioblksize.h"
+#include "mcel.h"
+#include "mbbuf.h"
 #include "expand-common.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -103,10 +107,14 @@ expand (void)
   if (!fp)
     return;
 
+  static char line_in[IO_BUFSIZE];
+  mbbuf_t mbbuf;
+  mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
+
   while (true)
     {
       /* Input character, or EOF.  */
-      int c;
+      mcel_t g;
 
       /* If true, perform translations.  */
       bool convert = true;
@@ -126,12 +134,16 @@ expand (void)
 
       do
         {
-          while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
-            continue;
+          while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF
+                 && (fp = next_file (fp)))
+            mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
 
           if (convert)
             {
-              if (c == '\t')
+              convert &= convert_entire_line
+                         || !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
+
+              if (g.ch == '\t')
                 {
                   /* Column the next input tab stop is on.  */
                   bool last_tab;
@@ -142,9 +154,12 @@ expand (void)
                     if (putchar (' ') < 0)
                       write_error ();
 
-                  c = ' ';
+                  if (putchar (' ') < 0)
+                    write_error ();
+
+                  continue;
                 }
-              else if (c == '\b')
+              else if (g.ch == '\b')
                 {
                   /* Go back one column, and force recalculation of the
                      next tab stop.  */
@@ -153,20 +168,21 @@ expand (void)
                 }
               else
                 {
-                  if (ckd_add (&column, column, 1))
+                  int width = c32width (g.ch);
+                  if (ckd_add (&column, column, width < 0 ? 1 : width))
                     error (EXIT_FAILURE, 0, _("input line is too long"));
                 }
 
-              convert &= convert_entire_line || !! isblank (c);
             }
 
-          if (c < 0)
+          if (g.ch == MBBUF_EOF)
             return;
 
-          if (putchar (c) < 0)
+          fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout);
+          if (ferror (stdout))
             write_error ();
         }
-      while (c != '\n');
+      while (g.ch != '\n');
     }
 }
 
diff --git a/src/unexpand.c b/src/unexpand.c
index 54b3ae2fe..16d0f0031 100644
--- a/src/unexpand.c
+++ b/src/unexpand.c
@@ -38,7 +38,11 @@
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
+
 #include "system.h"
+#include "ioblksize.h"
+#include "mbbuf.h"
+#include "mcel.h"
 #include "expand-common.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -120,15 +124,19 @@ unexpand (void)
   if (!fp)
     return;
 
+  static char line_in[IO_BUFSIZE];
+  mbbuf_t mbbuf;
+  mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
+
   /* The worst case is a non-blank character, then one blank, then a
      tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
-     allocate MAX_COLUMN_WIDTH bytes to store the blanks.  */
+     allocate MAX_COLUMN_WIDTH * MB_LEN_MAX bytes to store the blanks.  */
-  pending_blank = ximalloc (max_column_width);
+  pending_blank = ximalloc (max_column_width * sizeof (char) * MB_LEN_MAX);
 
   while (true)
     {
       /* Input character, or EOF.  */
-      int c;
+      mcel_t g;
 
       /* If true, perform translations.  */
       bool convert = true;
@@ -140,6 +148,9 @@ unexpand (void)
       /* Column of next input character.  */
       colno column = 0;
 
+      /* Column the next input tab stop is on.  */
+      colno next_tab_column = 0;
+
       /* Index in TAB_LIST of next tab stop to examine.  */
       idx_t tab_index = 0;
 
@@ -159,28 +170,27 @@ unexpand (void)
 
       do
         {
-          while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
-            continue;
+          while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF
+                 && (fp = next_file (fp)))
+            mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
 
           if (convert)
             {
-              bool blank = !! isblank (c);
+              bool blank = !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
 
               if (blank)
                 {
                   bool last_tab;
 
-                  /* Column the next input tab stop is on.  */
-                  colno next_tab_column = get_next_tab_column (column,
-                                                               &tab_index,
-                                                               &last_tab);
+                  next_tab_column = get_next_tab_column (column, &tab_index,
+                                                         &last_tab);
 
                   if (last_tab)
                     convert = false;
 
                   if (convert)
                     {
-                      if (c == '\t')
+                      if (g.ch == '\t')
                         {
                           column = next_tab_column;
 
@@ -189,7 +199,7 @@ unexpand (void)
                         }
                       else
                         {
-                          column++;
+                          column += c32width (g.ch);
 
                           if (! (prev_blank && column == next_tab_column))
                             {
@@ -197,13 +207,18 @@ unexpand (void)
                                  will be replaced by tabs.  */
                               if (column == next_tab_column)
                                 one_blank_before_tab_stop = true;
-                              pending_blank[pending++] = c;
+                              memcpy (pending_blank + pending,
+                                      mbbuf_char_offset (&mbbuf, g), g.len);
+                              pending += g.len;
                               prev_blank = true;
                               continue;
                             }
 
                           /* Replace the pending blanks by a tab or two.  */
-                          pending_blank[0] = c = '\t';
+                          g.len = 0;
+                          if (putc ('\t', stdout) < 0)
+                            write_error ();
+                          pending_blank[0] = '\t';
                         }
 
                       /* Discard pending blanks, unless it was a single
@@ -211,17 +226,18 @@ unexpand (void)
                       pending = one_blank_before_tab_stop;
                     }
                 }
-              else if (c == '\b')
+              else if (g.ch == '\b')
                 {
                   /* Go back one column, and force recalculation of the
                      next tab stop.  */
                   column -= !!column;
+                  next_tab_column = column;
                   tab_index -= !!tab_index;
                 }
               else
                 {
-                  column++;
-                  if (!column)
+                  int width = c32width (g.ch);
+                  if (ckd_add (&column, column, width < 0 ? 1 : width))
                     error (EXIT_FAILURE, 0, _("input line is too long"));
                 }
 
@@ -239,16 +255,17 @@ unexpand (void)
               convert &= convert_entire_line || blank;
             }
 
-          if (c < 0)
+          if (g.ch == MBBUF_EOF)
             {
               free (pending_blank);
               return;
             }
 
-          if (putchar (c) < 0)
+          fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout);
+          if (ferror (stdout))
             write_error ();
         }
-      while (c != '\n');
+      while (g.ch != '\n');
     }
 }
 
diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh
new file mode 100755
index 000000000..10ea160f4
--- /dev/null
+++ b/tests/expand/mb.sh
@@ -0,0 +1,171 @@
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ expand printf
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat <<\EOF > in || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a	b	c	d
+.       .       .       .
+ä	ö	ü	ß
+.       .       .       .
+EOF
+env printf '   äöü\t.    öüä.   \tä xx\n' >> in || framework_failure_
+
+cat <<\EOF > exp || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#multiple files as an input
+cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+
+expand ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with display widths != 1
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\u3000\t|ideo-space(2)
+\uFF0D\t|full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e       |ascii(1)
+\u00E9       |composed(1)
+e\u0301       |decomposed(1)
+\u3000      |ideo-space(2)
+\uFF0D      |full-hypen(2)
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#shouldn't fail with "input line is too long"
+#when a line starts with a control character
+env printf '\n' > in || framework_failure_
+
+expand < in > out || fail=1
+compare in out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcdef\xFF\t|
+' > in || framework_failure_
+
+env printf '12345678
+        \xFF|
+\xFF       |
+        \xFFä|
+ä\xFF      |
+        ä\xFF|
+\xFF       ä|
+äbcdef\xFF |
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+
+#BOM header test 1
+env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a	b	c	d
+.       .       .       .
+ä	ö	ü	ß
+.       .       .       .
+EOF
+env printf '   äöü\t.    öüä.   \tä xx\n' >> in || framework_failure_
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+env printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a	b	c	d
+.       .       .       .
+ä	ö	ü	ß
+.       .       .       .
+EOF
+env printf '   äöü\t.    öüä.   \tä xx\n' >> in1 || framework_failure_
+
+
+env printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+env printf '\xEF\xBB\xBF' >> exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+
+expand in1 in1 > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk
index 2d82a0de9..ef263e992 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -342,6 +342,7 @@ all_tests =					\
   tests/env/env-S-script.sh			\
   tests/expand/expand.pl			\
   tests/expand/bounded-memory.sh		\
+  tests/expand/mb.sh				\
   tests/expr/expr.pl				\
   tests/expr/expr-multibyte.pl			\
   tests/factor/factor.pl			\
@@ -504,6 +505,7 @@ all_tests =					\
   tests/misc/usage_vs_refs.sh			\
   tests/unexpand/unexpand.pl			\
   tests/unexpand/bounded-memory.sh		\
+  tests/unexpand/mb.sh				\
   tests/uniq/uniq.pl				\
   tests/uniq/uniq-perf.sh			\
   tests/uniq/uniq-collate.sh			\
diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh
new file mode 100755
index 000000000..dde30b594
--- /dev/null
+++ b/tests/unexpand/mb.sh
@@ -0,0 +1,163 @@
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ unexpand printf
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat > in <<\EOF
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+
+cat > exp <<\EOF
+1234567812345678123456781
+.	.	.	.
+a	b	c	d
+.	.	.	.
+ä	ö	ü	ß
+.	.	.	.
+   äöü	.    öüä.	ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+#multiple files as an input
+cat >> exp <<\EOF
+1234567812345678123456781
+.	.	.	.
+a	b	c	d
+.	.	.	.
+ä	ö	ü	ß
+.	.	.	.
+   äöü	.    öüä.	ä xx
+EOF
+
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with a display width larger than 1
+
+env printf '12345678
+e       |ascii(1)
+\u00E9       |composed(1)
+e\u0301       |decomposed(1)
+\u3000      |ideo-space(2)
+\u3000\u3000\u3000\u3000|ideo-space(2) * 4
+\uFF0D      |full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\t|ideo-space(2)
+\t|ideo-space(2) * 4
+\uFF0D\t|full-hypen(2)
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test that a blank of width > 1 (U+3000) short of a tab stop is kept as is
+env printf ' \u3000  ö       ü       ß' > in || framework_failure_
+env printf ' \u3000  ö\t     ü\t     ß' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+        \xFF|
+\xFF       |
+        \xFFä|
+ä\xFF      |
+        ä\xFF|
+\xFF       ä|
+äbcde\xFF  |
+' > in || framework_failure_
+
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcde\xFF\t|
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#BOM header test 1
+env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+.       .       .       .
+a       b       c       d
+.       .       .       .
+ä       ö       ü       ß
+.       .       .       .
+   äöü  .    öüä.       ä xx
+EOF
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.	.	.	.
+a	b	c	d
+.	.	.	.
+ä	ö	ü	ß
+.	.	.	.
+   äöü	.    öüä.	ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.	.	.	.
+a	b	c	d
+.	.	.	.
+ä	ö	ü	ß
+.	.	.	.
+   äöü	.    öüä.	ä xx
+EOF
+env printf "\xEF\xBB\xBF" >> exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+.	.	.	.
+a	b	c	d
+.	.	.	.
+ä	ö	ü	ß
+.	.	.	.
+   äöü	.    öüä.	ä xx
+EOF
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+Exit $fail
-- 
2.53.0

Reply via email to