I often need to count characters, so I wanted to make a ruler helper like:

  $ ruler() { yes 123456789 | head -n100 | src/paste -s -d¹²³⁴⁵⁶⁷⁸⁹⁰ |
      cut -c-$COLUMNS; }

So I could do things like:

  $ yes foo | head -n10 | paste -s -d.; ruler
  foo.foo.foo.foo.foo.foo.foo.foo.foo.foo
  123456789¹123456789²123456789³123456789⁴123456789⁵123456789⁶123456789⁷

But paste(1) needs multi-byte support for that,
which the attached patch implements.

cheers,
Padraig
From c40882965cd2620a57dd34638d4182a1373a6a58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 12 Jan 2026 23:41:29 +0000
Subject: [PATCH] paste: support multi-byte delimiters

* src/paste.c (collapse_escapes): This is the central --delimiters
parsing function, so adjust to handle multi-byte chars with
mcel_scanz().  Populate a delimiters length array to support
characters of differing lengths.
(paste_serial): Use the delimiters length array to output
the appropriate delimiter.
(paste_parallel): Likewise.
* tests/paste/multi-byte.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
---
 NEWS                           |   2 +
 src/paste.c                    | 177 +++++++++++++++++++--------------
 tests/local.mk                 |   3 +-
 tests/paste/multi-byte.sh      | 105 +++++++++++++++++++
 tests/{misc => paste}/paste.pl |   0
 5 files changed, 213 insertions(+), 74 deletions(-)
 create mode 100755 tests/paste/multi-byte.sh
 rename tests/{misc => paste}/paste.pl (100%)

diff --git a/NEWS b/NEWS
index 1ccc52426..b4031caa0 100644
--- a/NEWS
+++ b/NEWS
@@ -81,6 +81,8 @@ GNU coreutils NEWS                                    -*- outline -*-
   'du' now processes directories with 10,000 or more entries up to 9 times
   faster on the Lustre file system.
 
+  'paste' now supports multi-byte --delimiters characters.
+
   'pinky' will now exit immediately upon receiving a write error, which is
   significant when reading large plan or project files.
 
diff --git a/src/paste.c b/src/paste.c
index f48f57f6b..01ed596e2 100644
--- a/src/paste.c
+++ b/src/paste.c
@@ -42,6 +42,7 @@
 #include <sys/types.h>
 #include "system.h"
 #include "fadvise.h"
+#include "mcel.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
 #define PROGRAM_NAME "paste"
@@ -50,9 +51,6 @@
   proper_name ("David M. Ihnat"), \
   proper_name ("David MacKenzie")
 
-/* Indicates that no delimiter should be added in the current position. */
-#define EMPTY_DELIM '\0'
-
 /* If nonzero, we have read standard input at some point. */
 static bool have_read_stdin;
 
@@ -60,11 +58,16 @@ static bool have_read_stdin;
    corresponding lines from each file in parallel. */
 static bool serial_merge;
 
-/* The delimiters between lines of input files (used cyclically). */
+/* The delimiters between lines of input files (used cyclically).
+   This stores the raw bytes of all delimiters concatenated.  */
 static char *delims;
 
-/* A pointer to the character after the end of 'delims'. */
-static char const *delim_end;
+/* Length of each delimiter in bytes (supports multi-byte characters).
+   A length of 0 indicates no delimiter at this position (from \0 escape).  */
+static size_t *delim_lens;
+
+/* Number of delimiters.  */
+static idx_t num_delims;
 
 static unsigned char line_delim = '\n';
 
@@ -78,10 +81,10 @@ static struct option const longopts[] =
   {nullptr, 0, nullptr, 0}
 };
 
-/* Set globals delims and delim_end.  Copy STRPTR to DELIMS, converting
-   backslash representations of special characters in STRPTR to their actual
-   values. The set of possible backslash characters has been expanded beyond
-   that recognized by the Unix version.
+/* Set globals delims, delim_lens, and num_delims.
+   Process STRPTR converting backslash representations of special characters
+   to their actual values.  The set of possible backslash characters has been
+   expanded beyond that recognized by the Unix version.
    Return 0 upon success.
    If the string ends in an odd number of backslashes, ignore the
    final backslash and return nonzero.  */
@@ -93,62 +96,65 @@ collapse_escapes (char const *strptr)
   bool backslash_at_end = false;
 
   delims = strout;
+  delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens);
+
+  char const *s = strptr;
+  idx_t idx = 0;
 
-  while (*strptr)
+  while (*s)
     {
-      if (*strptr != '\\')	/* Is it an escape character? */
-        *strout++ = *strptr++;	/* No, just transfer it. */
-      else
+      if (*s == '\\')
         {
-          switch (*++strptr)
+          s++;
+          if (*s == '\0')
             {
-            case '0':
-              *strout++ = EMPTY_DELIM;
-              break;
-
-            case 'b':
-              *strout++ = '\b';
-              break;
-
-            case 'f':
-              *strout++ = '\f';
-              break;
-
-            case 'n':
-              *strout++ = '\n';
-              break;
-
-            case 'r':
-              *strout++ = '\r';
+              backslash_at_end = true;
               break;
+            }
+          else if (*s == '0')
+            {
+              /* Empty delimiter at this position.  */
+              s++;
+              delim_lens[idx++] = 0;
+            }
+          else
+            {
+              switch (*s)
+                {
+                case 'b': *strout++ = '\b'; break;
+                case 'f': *strout++ = '\f'; break;
+                case 'n': *strout++ = '\n'; break;
+                case 'r': *strout++ = '\r'; break;
+                case 't': *strout++ = '\t'; break;
+                case 'v': *strout++ = '\v'; break;
+                case '\\': *strout++ = '\\'; break;
+                default: goto copy_character;
+                }
 
-            case 't':
-              *strout++ = '\t';
-              break;
+              s++;
+              delim_lens[idx++] = 1;
+            }
 
-            case 'v':
-              *strout++ = '\v';
-              break;
+          continue;
+        }
 
-            case '\\':
-              *strout++ = '\\';
-              break;
+      copy_character:
+      mcel_t g = mcel_scanz (s);
+      strout = mempcpy (strout, s, g.len);
+      s += g.len;
+      delim_lens[idx++] = g.len;
+    }
 
-            case '\0':
-              backslash_at_end = true;
-              goto done;
+  *strout = '\0';
 
-            default:
-              *strout++ = *strptr;
-              break;
-            }
-          strptr++;
-        }
+  if (idx == 0)
+    {
+      delim_lens[0] = 0;
+      idx = 1;
     }
 
- done:
+  num_delims = idx;
 
-  delim_end = strout;
   return backslash_at_end ? 1 : 0;
 }
 
@@ -161,6 +167,16 @@ xputchar (char c)
     write_error ();
 }
 
+/* Output the delimiter at DELIMPTR with length LEN.
+   If LEN is 0, nothing is output (empty delimiter from \0 escape).  */
+
+static inline void
+output_delim (char const *delimptr, size_t len)
+{
+  if (len > 0 && fwrite (delimptr, 1, len, stdout) != len)
+    write_error ();
+}
+
 /* Perform column paste on the NFILES files named in FNAMPTR.
    Return true if successful, false if one or more files could not be
    opened or read. */
@@ -171,9 +187,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
   bool ok = true;
   /* If all files are just ready to be closed, or will be on this
      round, the string of delimiters must be preserved.
-     delbuf[0] through delbuf[nfiles]
-     store the delimiters for closed files. */
-  char *delbuf = xmalloc (nfiles + 2);
+     delbuf stores the delimiter bytes for closed files.
+     Size it to hold up to (nfiles - 1) delimiters.  */
+  char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1);
 
   /* Streams open to the files to process; null if the corresponding
      stream is closed.  */
@@ -218,8 +234,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
     {
       /* Set up for the next line. */
       bool somedone = false;
-      char const *delimptr = delims;
-      size_t delims_saved = 0;	/* Number of delims saved in 'delbuf'. */
+      idx_t delimidx = 0;              /* Current delimiter index.  */
+      idx_t delimoff = 0;              /* Current offset into delims.  */
+      idx_t delims_saved = 0;          /* Bytes saved in 'delbuf'. */
 
       for (size_t i = 0; i < nfiles && files_open; i++)
         {
@@ -292,10 +309,18 @@ paste_parallel (size_t nfiles, char **fnamptr)
               else
                 {
                   /* Closed file; add delimiter to 'delbuf'. */
-                  if (*delimptr != EMPTY_DELIM)
-                    delbuf[delims_saved++] = *delimptr;
-                  if (++delimptr == delim_end)
-                    delimptr = delims;
+                  size_t len = delim_lens[delimidx];
+                  if (len > 0)
+                    {
+                      memcpy (delbuf + delims_saved, delims + delimoff, len);
+                      delims_saved += len;
+                    }
+                  delimoff += len;
+                  if (++delimidx == num_delims)
+                    {
+                      delimidx = 0;
+                      delimoff = 0;
+                    }
                 }
             }
           else
@@ -308,10 +333,13 @@ paste_parallel (size_t nfiles, char **fnamptr)
                 {
                   if (chr != line_delim && chr != EOF)
                     xputchar (chr);
-                  if (*delimptr != EMPTY_DELIM)
-                    xputchar (*delimptr);
-                  if (++delimptr == delim_end)
-                    delimptr = delims;
+                  output_delim (delims + delimoff, delim_lens[delimidx]);
+                  delimoff += delim_lens[delimidx];
+                  if (++delimidx == num_delims)
+                    {
+                      delimidx = 0;
+                      delimoff = 0;
+                    }
                 }
               else
                 {
@@ -337,7 +365,6 @@ paste_serial (size_t nfiles, char **fnamptr)
 {
   bool ok = true;	/* false if open or read errors occur. */
   int charnew, charold; /* Current and previous char read. */
-  char const *delimptr;	/* Current delimiter char. */
   FILE *fileptr;	/* Open for reading current file. */
 
   for (; nfiles; nfiles--, fnamptr++)
@@ -361,7 +388,8 @@ paste_serial (size_t nfiles, char **fnamptr)
           fadvise (fileptr, FADVISE_SEQUENTIAL);
         }
 
-      delimptr = delims;	/* Set up for delimiter string. */
+      idx_t delimidx = 0;      /* Current delimiter index.  */
+      idx_t delimoff = 0;      /* Current offset into delims.  */
 
       charold = getc (fileptr);
       saved_errno = errno;
@@ -378,11 +406,13 @@ paste_serial (size_t nfiles, char **fnamptr)
               /* Process the old character. */
               if (charold == line_delim)
                 {
-                  if (*delimptr != EMPTY_DELIM)
-                    xputchar (*delimptr);
-
-                  if (++delimptr == delim_end)
-                    delimptr = delims;
+                  output_delim (delims + delimoff, delim_lens[delimidx]);
+                  delimoff += delim_lens[delimidx];
+                  if (++delimidx == num_delims)
+                    {
+                      delimidx = 0;
+                      delimoff = 0;
+                    }
                 }
               else
                 xputchar (charold);
@@ -520,6 +550,7 @@ main (int argc, char **argv)
              (nfiles, &argv[optind]));
 
   free (delims);
+  free (delim_lens);
 
   if (have_read_stdin && fclose (stdin) == EOF)
     error (EXIT_FAILURE, errno, "-");
diff --git a/tests/local.mk b/tests/local.mk
index 01db53678..4bae33c6e 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -377,9 +377,10 @@ all_tests =					\
   tests/od/od-j.sh				\
   tests/od/od-multiple-t.sh			\
   tests/od/od-x8.sh				\
-  tests/misc/paste.pl				\
   tests/misc/pathchk.sh				\
   tests/misc/printenv.sh			\
+  tests/paste/paste.pl				\
+  tests/paste/multi-byte.sh			\
   tests/printf/printf.sh			\
   tests/printf/printf-cov.pl			\
   tests/printf/printf-hex.sh			\
diff --git a/tests/paste/multi-byte.sh b/tests/paste/multi-byte.sh
new file mode 100755
index 000000000..e22f9e3f6
--- /dev/null
+++ b/tests/paste/multi-byte.sh
@@ -0,0 +1,105 @@
+#!/bin/sh
+# Test multi-byte delimiter handling in paste
+
+# Copyright (C) 2026 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ paste printf
+
+# Test UTF-8 multi-byte delimiters
+export LC_ALL=en_US.UTF-8
+
+# Skip if UTF-8 is not supported
+test "$(locale charmap 2>/dev/null)" = UTF-8 ||
+  skip_ 'UTF-8 locale not available'
+
+# UTF-8 test: 2-byte character (e.g., cent sign)
+delim_cent=$(env printf '\xc2\xa2')
+# UTF-8 test: 3-byte character (e.g., euro sign)
+delim_euro=$(env printf '\xe2\x82\xac')
+# UTF-8 test: 4-byte character (e.g., emoji: U+1F600)
+delim_emoji=$(env printf '\xf0\x9f\x98\x80')
+
+printf '1\n2\n' > f1 || framework_failure_
+printf 'a\nb\n' > f2 || framework_failure_
+
+# Test parallel mode with multi-byte delimiters
+for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
+  paste -d "$delim" f1 f2 > out || fail=1
+  printf "1${delim}a\n2${delim}b\n" > exp || framework_failure_
+  compare exp out || fail=1
+done
+
+# Test serial mode with multi-byte delimiters
+printf '1\n2\n3\n' > f3 || framework_failure_
+for delim in "$delim_cent" "$delim_euro"; do
+  paste -s -d "$delim" f3 > out || fail=1
+  printf "1${delim}2${delim}3\n" > exp || framework_failure_
+  compare exp out || fail=1
+done
+
+# Test multiple multi-byte delimiters cycling
+printf 'a\nb\nc\n' > f4 || framework_failure_
+printf '1\n2\n3\n' > f5 || framework_failure_
+printf 'x\ny\nz\n' > f6 || framework_failure_
+paste -d "${delim_cent}${delim_euro}" f4 f5 f6 > out || fail=1
+printf "a${delim_cent}1${delim_euro}x\n" > exp || framework_failure_
+printf "b${delim_cent}2${delim_euro}y\n" >> exp || framework_failure_
+printf "c${delim_cent}3${delim_euro}z\n" >> exp || framework_failure_
+compare exp out || fail=1
+
+# Test multi-byte delimiters mixed with empty delimiter (\0)
+paste -s -d "${delim_euro}\\0" f3 > out || fail=1
+printf "1${delim_euro}23\n" > exp || framework_failure_
+compare exp out || fail=1
+
+# Test invalid UTF-8 sequences are still passed through
+delims_invalid=$(bad_unicode)
+delim_invalid=$(env printf '%s' "$delims_invalid" | cut -b1)
+paste -d "$delims_invalid" f1 f2 > out || fail=1
+printf "1${delim_invalid}a\n2${delim_invalid}b\n" > exp || framework_failure_
+compare exp out || fail=1
+
+# Test that \<multi-byte char> is treated like <multi-byte char>
+# (unknown escapes pass through the escaped character)
+paste -d "\\${delim_euro}" f1 f2 > out || fail=1
+paste -d "$delim_euro" f1 f2 > exp || fail=1
+compare exp out || fail=1
+
+
+# Test GB18030 encoding if available
+export LC_ALL=zh_CN.gb18030
+
+if test "$(locale charmap 2>/dev/null | sed 's/gb/GB/')" = GB18030; then
+  # GB18030 2-byte character (e.g., 0xA2 0xE3 is a valid GB18030 char)
+  delim_gb18030=$(env printf '\xa2\xe3')
+
+  paste -d "$delim_gb18030" f1 f2 > out || fail=1
+  printf "1${delim_gb18030}a\n2${delim_gb18030}b\n" > exp || framework_failure_
+  compare exp out || fail=1
+
+  paste -s -d "$delim_gb18030" f3 > out || fail=1
+  printf "1${delim_gb18030}2${delim_gb18030}3\n" > exp || framework_failure_
+  compare exp out || fail=1
+
+  # Note 0xFF is invalid in GB18030, but we support all single byte delimiters
+  delim_ff=$(env printf '\xff')
+  paste -d "$delim_ff" f1 f2 > out || fail=1
+  printf "1${delim_ff}a\n2${delim_ff}b\n" > exp || framework_failure_
+  compare exp out || fail=1
+fi
+
+Exit $fail
diff --git a/tests/misc/paste.pl b/tests/paste/paste.pl
similarity index 100%
rename from tests/misc/paste.pl
rename to tests/paste/paste.pl
-- 
2.52.0

Reply via email to