I often need to count characters, so I wanted to make a ruler helper like:
$ ruler() { yes 123456789 | head -n100 | src/paste -s -d¹²³⁴⁵⁶⁷⁸⁹⁰ | cut -c-$COLUMNS; }
So I could do things like:
$ yes foo | head -n10 | paste -s -d.; ruler
foo.foo.foo.foo.foo.foo.foo.foo.foo.foo
123456789¹123456789²123456789³123456789⁴123456789⁵123456789⁶123456789⁷
But paste(1) needs multi-byte support for that,
which the attached implements.
cheers,
Padraig

From c40882965cd2620a57dd34638d4182a1373a6a58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 12 Jan 2026 23:41:29 +0000
Subject: [PATCH] paste: support multi-byte delimiters
* src/paste.c (collapse_escapes): This is the central --delimiters
parsing function, so adjust to handle multi-byte chars with
mcel_scanz(). Populate a delimiters length array to support
characters of differing lengths.
(paste_serial): Use the delimiters length array to output
the appropriate delimiter.
(paste_parallel): Likewise.
* tests/paste/multi-byte.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
---
NEWS | 2 +
src/paste.c | 177 +++++++++++++++++++--------------
tests/local.mk | 3 +-
tests/paste/multi-byte.sh | 105 +++++++++++++++++++
tests/{misc => paste}/paste.pl | 0
5 files changed, 213 insertions(+), 74 deletions(-)
create mode 100755 tests/paste/multi-byte.sh
rename tests/{misc => paste}/paste.pl (100%)
diff --git a/NEWS b/NEWS
index 1ccc52426..b4031caa0 100644
--- a/NEWS
+++ b/NEWS
@@ -81,6 +81,8 @@ GNU coreutils NEWS -*- outline -*-
'du' now processes directories with 10,000 or more entries up to 9 times
faster on the Lustre file system.
+ 'paste' now supports multi-byte --delimiters characters.
+
'pinky' will now exit immediately upon receiving a write error, which is
significant when reading large plan or project files.
diff --git a/src/paste.c b/src/paste.c
index f48f57f6b..01ed596e2 100644
--- a/src/paste.c
+++ b/src/paste.c
@@ -42,6 +42,7 @@
#include <sys/types.h>
#include "system.h"
#include "fadvise.h"
+#include "mcel.h"
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "paste"
@@ -50,9 +51,6 @@
proper_name ("David M. Ihnat"), \
proper_name ("David MacKenzie")
-/* Indicates that no delimiter should be added in the current position. */
-#define EMPTY_DELIM '\0'
-
/* If nonzero, we have read standard input at some point. */
static bool have_read_stdin;
@@ -60,11 +58,16 @@ static bool have_read_stdin;
corresponding lines from each file in parallel. */
static bool serial_merge;
-/* The delimiters between lines of input files (used cyclically). */
+/* The delimiters between lines of input files (used cyclically).
+ This stores the raw bytes of all delimiters concatenated. */
static char *delims;
-/* A pointer to the character after the end of 'delims'. */
-static char const *delim_end;
+/* Length of each delimiter in bytes (supports multi-byte characters).
+ A length of 0 indicates no delimiter at this position (from \0 escape). */
+static size_t *delim_lens;
+
+/* Number of delimiters. */
+static idx_t num_delims;
static unsigned char line_delim = '\n';
@@ -78,10 +81,10 @@ static struct option const longopts[] =
{nullptr, 0, nullptr, 0}
};
-/* Set globals delims and delim_end. Copy STRPTR to DELIMS, converting
- backslash representations of special characters in STRPTR to their actual
- values. The set of possible backslash characters has been expanded beyond
- that recognized by the Unix version.
+/* Set globals delims, delim_lens, and num_delims.
+ Process STRPTR converting backslash representations of special characters
+ to their actual values. The set of possible backslash characters has been
+ expanded beyond that recognized by the Unix version.
Return 0 upon success.
If the string ends in an odd number of backslashes, ignore the
final backslash and return nonzero. */
@@ -93,62 +96,65 @@ collapse_escapes (char const *strptr)
bool backslash_at_end = false;
delims = strout;
+ delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens);
+
+ char const *s = strptr;
+ idx_t idx = 0;
- while (*strptr)
+ while (*s)
{
- if (*strptr != '\\') /* Is it an escape character? */
- *strout++ = *strptr++; /* No, just transfer it. */
- else
+ if (*s == '\\')
{
- switch (*++strptr)
+ s++;
+ if (*s == '\0')
{
- case '0':
- *strout++ = EMPTY_DELIM;
- break;
-
- case 'b':
- *strout++ = '\b';
- break;
-
- case 'f':
- *strout++ = '\f';
- break;
-
- case 'n':
- *strout++ = '\n';
- break;
-
- case 'r':
- *strout++ = '\r';
+ backslash_at_end = true;
break;
+ }
+ else if (*s == '0')
+ {
+ /* Empty delimiter at this position. */
+ s++;
+ delim_lens[idx++] = 0;
+ }
+ else
+ {
+ switch (*s)
+ {
+ case 'b': *strout++ = '\b'; break;
+ case 'f': *strout++ = '\f'; break;
+ case 'n': *strout++ = '\n'; break;
+ case 'r': *strout++ = '\r'; break;
+ case 't': *strout++ = '\t'; break;
+ case 'v': *strout++ = '\v'; break;
+ case '\\': *strout++ = '\\'; break;
+ default: goto copy_character;
+ }
- case 't':
- *strout++ = '\t';
- break;
+ s++;
+ delim_lens[idx++] = 1;
+ }
- case 'v':
- *strout++ = '\v';
- break;
+ continue;
+ }
- case '\\':
- *strout++ = '\\';
- break;
+ copy_character:
+ mcel_t g = mcel_scanz (s);
+ strout = mempcpy (strout, s, g.len);
+ s += g.len;
+ delim_lens[idx++] = g.len;
+ }
- case '\0':
- backslash_at_end = true;
- goto done;
+ *strout = '\0';
- default:
- *strout++ = *strptr;
- break;
- }
- strptr++;
- }
+ if (idx == 0)
+ {
+ delim_lens[0] = 0;
+ idx = 1;
}
- done:
+ num_delims = idx;
- delim_end = strout;
return backslash_at_end ? 1 : 0;
}
@@ -161,6 +167,16 @@ xputchar (char c)
write_error ();
}
+/* Write LEN bytes from DELIMPTR to stdout; call write_error on a short write.
+ If LEN is 0 (a \0 escape or an empty delimiter list), output nothing. */
+
+static inline void
+output_delim (char const *delimptr, size_t len)
+{
+if (len > 0 && fwrite (delimptr, 1, len, stdout) != len)
+write_error ();
+}
+
/* Perform column paste on the NFILES files named in FNAMPTR.
Return true if successful, false if one or more files could not be
opened or read. */
@@ -171,9 +187,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
bool ok = true;
/* If all files are just ready to be closed, or will be on this
round, the string of delimiters must be preserved.
- delbuf[0] through delbuf[nfiles]
- store the delimiters for closed files. */
- char *delbuf = xmalloc (nfiles + 2);
+ delbuf stores the delimiter bytes for closed files.
+ Size it to hold up to (nfiles - 1) delimiters. */
+ char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1);
/* Streams open to the files to process; null if the corresponding
stream is closed. */
@@ -218,8 +234,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
{
/* Set up for the next line. */
bool somedone = false;
- char const *delimptr = delims;
- size_t delims_saved = 0; /* Number of delims saved in 'delbuf'. */
+ idx_t delimidx = 0; /* Current delimiter index. */
+ idx_t delimoff = 0; /* Current offset into delims. */
+ idx_t delims_saved = 0; /* Bytes saved in 'delbuf'. */
for (size_t i = 0; i < nfiles && files_open; i++)
{
@@ -292,10 +309,18 @@ paste_parallel (size_t nfiles, char **fnamptr)
else
{
/* Closed file; add delimiter to 'delbuf'. */
- if (*delimptr != EMPTY_DELIM)
- delbuf[delims_saved++] = *delimptr;
- if (++delimptr == delim_end)
- delimptr = delims;
+ size_t len = delim_lens[delimidx];
+ if (len > 0)
+ {
+ memcpy (delbuf + delims_saved, delims + delimoff, len);
+ delims_saved += len;
+ }
+ delimoff += len;
+ if (++delimidx == num_delims)
+ {
+ delimidx = 0;
+ delimoff = 0;
+ }
}
}
else
@@ -308,10 +333,13 @@ paste_parallel (size_t nfiles, char **fnamptr)
{
if (chr != line_delim && chr != EOF)
xputchar (chr);
- if (*delimptr != EMPTY_DELIM)
- xputchar (*delimptr);
- if (++delimptr == delim_end)
- delimptr = delims;
+ output_delim (delims + delimoff, delim_lens[delimidx]);
+ delimoff += delim_lens[delimidx];
+ if (++delimidx == num_delims)
+ {
+ delimidx = 0;
+ delimoff = 0;
+ }
}
else
{
@@ -337,7 +365,6 @@ paste_serial (size_t nfiles, char **fnamptr)
{
bool ok = true; /* false if open or read errors occur. */
int charnew, charold; /* Current and previous char read. */
- char const *delimptr; /* Current delimiter char. */
FILE *fileptr; /* Open for reading current file. */
for (; nfiles; nfiles--, fnamptr++)
@@ -361,7 +388,8 @@ paste_serial (size_t nfiles, char **fnamptr)
fadvise (fileptr, FADVISE_SEQUENTIAL);
}
- delimptr = delims; /* Set up for delimiter string. */
+ idx_t delimidx = 0; /* Current delimiter index. */
+ idx_t delimoff = 0; /* Current offset into delims. */
charold = getc (fileptr);
saved_errno = errno;
@@ -378,11 +406,13 @@ paste_serial (size_t nfiles, char **fnamptr)
/* Process the old character. */
if (charold == line_delim)
{
- if (*delimptr != EMPTY_DELIM)
- xputchar (*delimptr);
-
- if (++delimptr == delim_end)
- delimptr = delims;
+ output_delim (delims + delimoff, delim_lens[delimidx]);
+ delimoff += delim_lens[delimidx];
+ if (++delimidx == num_delims)
+ {
+ delimidx = 0;
+ delimoff = 0;
+ }
}
else
xputchar (charold);
@@ -520,6 +550,7 @@ main (int argc, char **argv)
(nfiles, &argv[optind]));
free (delims);
+ free (delim_lens);
if (have_read_stdin && fclose (stdin) == EOF)
error (EXIT_FAILURE, errno, "-");
diff --git a/tests/local.mk b/tests/local.mk
index 01db53678..4bae33c6e 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -377,9 +377,10 @@ all_tests = \
tests/od/od-j.sh \
tests/od/od-multiple-t.sh \
tests/od/od-x8.sh \
- tests/misc/paste.pl \
tests/misc/pathchk.sh \
tests/misc/printenv.sh \
+ tests/paste/paste.pl \
+ tests/paste/multi-byte.sh \
tests/printf/printf.sh \
tests/printf/printf-cov.pl \
tests/printf/printf-hex.sh \
diff --git a/tests/paste/multi-byte.sh b/tests/paste/multi-byte.sh
new file mode 100755
index 000000000..e22f9e3f6
--- /dev/null
+++ b/tests/paste/multi-byte.sh
@@ -0,0 +1,105 @@
+#!/bin/sh
+# Test multi-byte delimiter handling in paste
+
+# Copyright (C) 2026 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ paste printf
+
+# Test UTF-8 multi-byte delimiters
+export LC_ALL=en_US.UTF-8
+
+# Skip if UTF-8 is not supported
+test "$(locale charmap 2>/dev/null)" = UTF-8 ||
+ skip_ 'UTF-8 locale not available'
+
+# UTF-8 test: 2-byte character (e.g., cent sign)
+delim_cent=$(env printf '\xc2\xa2')
+# UTF-8 test: 3-byte character (e.g., euro sign)
+delim_euro=$(env printf '\xe2\x82\xac')
+# UTF-8 test: 4-byte character (e.g., emoji: U+1F600)
+delim_emoji=$(env printf '\xf0\x9f\x98\x80')
+
+printf '1\n2\n' > f1 || framework_failure_
+printf 'a\nb\n' > f2 || framework_failure_
+
+# Test parallel mode with multi-byte delimiters
+for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
+ paste -d "$delim" f1 f2 > out || fail=1
+ printf "1${delim}a\n2${delim}b\n" > exp || framework_failure_
+ compare exp out || fail=1
+done
+
+# Test serial mode with multi-byte delimiters
+printf '1\n2\n3\n' > f3 || framework_failure_
+for delim in "$delim_cent" "$delim_euro"; do
+ paste -s -d "$delim" f3 > out || fail=1
+ printf "1${delim}2${delim}3\n" > exp || framework_failure_
+ compare exp out || fail=1
+done
+
+# Test multiple multi-byte delimiters cycling
+printf 'a\nb\nc\n' > f4 || framework_failure_
+printf '1\n2\n3\n' > f5 || framework_failure_
+printf 'x\ny\nz\n' > f6 || framework_failure_
+paste -d "${delim_cent}${delim_euro}" f4 f5 f6 > out || fail=1
+printf "a${delim_cent}1${delim_euro}x\n" > exp || framework_failure_
+printf "b${delim_cent}2${delim_euro}y\n" >> exp || framework_failure_
+printf "c${delim_cent}3${delim_euro}z\n" >> exp || framework_failure_
+compare exp out || fail=1
+
+# Test multi-byte delimiters mixed with empty delimiter (\0)
+paste -s -d "${delim_euro}\\0" f3 > out || fail=1
+printf "1${delim_euro}23\n" > exp || framework_failure_
+compare exp out || fail=1
+
+# Test invalid UTF-8 sequences are still passed through
+delims_invalid=$(bad_unicode)
+delim_invalid=$(env printf '%s' "$delims_invalid" | cut -b1)
+paste -d "$delims_invalid" f1 f2 > out || fail=1
+printf "1${delim_invalid}a\n2${delim_invalid}b\n" > exp || framework_failure_
+compare exp out || fail=1
+
+# Test that \<multi-byte char> is treated like <multi-byte char>
+# (unknown escapes pass through the escaped character)
+paste -d "\\${delim_euro}" f1 f2 > out || fail=1
+paste -d "$delim_euro" f1 f2 > exp || fail=1
+compare exp out || fail=1
+
+
+# Test GB18030 encoding if available
+export LC_ALL=zh_CN.gb18030
+
+if test "$(locale charmap 2>/dev/null | sed 's/gb/GB/')" = GB18030; then
+ # GB18030 2-byte character (e.g., 0xA2 0xE3 is a valid GB18030 char)
+ delim_gb18030=$(env printf '\xa2\xe3')
+
+ paste -d "$delim_gb18030" f1 f2 > out || fail=1
+ printf "1${delim_gb18030}a\n2${delim_gb18030}b\n" > exp || framework_failure_
+ compare exp out || fail=1
+
+ paste -s -d "$delim_gb18030" f3 > out || fail=1
+ printf "1${delim_gb18030}2${delim_gb18030}3\n" > exp || framework_failure_
+ compare exp out || fail=1
+
+ # Note 0xFF is invalid in GB18030, but we support all single byte delimiters
+ delim_ff=$(env printf '\xff')
+ paste -d "$delim_ff" f1 f2 > out || fail=1
+ printf "1${delim_ff}a\n2${delim_ff}b\n" > exp || framework_failure_
+ compare exp out || fail=1
+fi
+
+Exit $fail
diff --git a/tests/misc/paste.pl b/tests/paste/paste.pl
similarity index 100%
rename from tests/misc/paste.pl
rename to tests/paste/paste.pl
--
2.52.0