[PATCH] nl: support multibyte section delimiters

Collin Funk Sat, 07 Feb 2026 11:17:30 -0800

I wrote this patch last night after noticing that only 'nl -d' needs
changing for multibyte character support. Or am I missing something?


-- 8< --

* src/nl.c: Include mcel.h.
(DEFAULT_SECTION_DELIMITERS): Resize to fit 2 multibyte characters.
(section_del_len): New variable.
(check_section): Compare against section_del_len instead of 2.
(main): Support multibyte characters for the -d option.
* tests/nl/multibyte.sh: New file.
* tests/nl/nl.sh: New file, moved from tests/misc/nl.sh.
* tests/local.mk (all_tests): Add the new test. Adjust the file name of
the existing test.
* cfg.mk (exclude_file_name_regexp--sc_space_tab): Adjust the file name
of the existing test.
---
 cfg.mk                   |  2 +-
 src/nl.c                 | 31 ++++++++++++++++------
 tests/local.mk           |  3 ++-
 tests/nl/multibyte.sh    | 56 ++++++++++++++++++++++++++++++++++++++++
 tests/{misc => nl}/nl.sh |  0
 5 files changed, 82 insertions(+), 10 deletions(-)
 create mode 100755 tests/nl/multibyte.sh
 rename tests/{misc => nl}/nl.sh (100%)

diff --git a/cfg.mk b/cfg.mk
index b394f6698..27b63f93b 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -905,7 +905,7 @@ update-copyright-env = \
 
 # List syntax-check exemptions.
 exclude_file_name_regexp--sc_space_tab = \
-  ^(tests/pr/|tests/misc/nl\.sh$$|gl/.*\.diff$$|man/help2man$$)
+  ^(tests/pr/|tests/nl/nl\.sh$$|gl/.*\.diff$$|man/help2man$$)
 exclude_file_name_regexp--sc_bindtextdomain = \
   ^(gl/.*|lib/euidaccess-stat|src/make-prime-list|src/cksum_crc)\.c$$
 exclude_file_name_regexp--sc_trailing_blank = \
diff --git a/src/nl.c b/src/nl.c
index 805de3491..8a423a11b 100644
--- a/src/nl.c
+++ b/src/nl.c
@@ -29,6 +29,7 @@
 
 #include "fadvise.h"
 #include "linebuffer.h"
+#include "mcel.h"
 #include "quote.h"
 #include "xdectoint.h"
 
@@ -52,7 +53,7 @@ static char const FORMAT_RIGHT_LZ[] = "%0*jd%s";
 static char const FORMAT_LEFT[] = "%-*jd%s";
 
 /* Default section delimiter characters.  */
-static char DEFAULT_SECTION_DELIMITERS[] = "\\:";
+static char DEFAULT_SECTION_DELIMITERS[MCEL_LEN_MAX * 2 + 1] = "\\:";
 
 /* Types of input lines: either one of the section delimiters,
    or text to output. */
@@ -96,6 +97,9 @@ static char const *separator_str = "\t";
 /* Input section delimiter string (-d).  */
 static char *section_del = DEFAULT_SECTION_DELIMITERS;
 
+/* Length of the input section delimiter string, in bytes.  */
+static size_t section_del_len;
+
 /* Header delimiter string.  */
 static char *header_del = NULL;
 
@@ -405,7 +409,7 @@ check_section (void)
   size_t len = line_buf.length - 1;
 
   if (len < 2 || footer_del_len < 2
-      || !memeq (line_buf.buffer, section_del, 2))
+      || !memeq (line_buf.buffer, section_del, section_del_len))
     return Text;
   if (len == header_del_len
       && memeq (line_buf.buffer, header_del, header_del_len))
@@ -578,14 +582,25 @@ main (int argc, char **argv)
           break;
         case 'd':
           len = strlen (optarg);
-          if (len == 1 || len == 2)  /* POSIX.  */
+          if (1 < MB_CUR_MAX)
             {
-              char *p = section_del;
-              while (*optarg)
-                *p++ = *optarg++;
+              char const *p = optarg;
+              char const *lim = p + len;
+              int n_chars = 0;
+              for (; p < lim && n_chars < 2; ++n_chars)
+                  p += mcel_scan (p, lim).len;
+              if (n_chars == 1)
+                memcpy (mempcpy (section_del, optarg, len),  ":", sizeof ":");
+              else
+                section_del = optarg;
             }
           else
-            section_del = optarg;  /* GNU extension.  */
+            {
+              if (len == 1)
+                *section_del = *optarg;
+              else
+                section_del = optarg;
+            }
           break;
         case_GETOPT_HELP_CHAR;
         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
@@ -599,7 +614,7 @@ main (int argc, char **argv)
     usage (EXIT_FAILURE);
 
   /* Initialize the section delimiters.  */
-  len = strlen (section_del);
+  section_del_len = len = strlen (section_del);
 
   header_del_len = len * 3;
   header_del = xmalloc (header_del_len + 1);
diff --git a/tests/local.mk b/tests/local.mk
index 60e651b94..06b86b779 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -367,7 +367,8 @@ all_tests =                                 \
   tests/misc/mknod.sh                          \
   tests/nice/nice.sh                           \
   tests/nice/nice-fail.sh                      \
-  tests/misc/nl.sh                             \
+  tests/nl/nl.sh                               \
+  tests/nl/multibyte.sh                                \
   tests/misc/nohup.sh                          \
   tests/nproc/nproc-avail.sh                   \
   tests/nproc/nproc-positive.sh                        \
diff --git a/tests/nl/multibyte.sh b/tests/nl/multibyte.sh
new file mode 100755
index 000000000..e1eb40467
--- /dev/null
+++ b/tests/nl/multibyte.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+# Test nl with multibyte section delimiters.
+
+# Copyright (C) 2026 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ nl printf
+
+test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
+
+cat <<\EOF > exp || framework_failure_
+
+     1 a
+
+     2 b
+
+     3 c
+EOF
+
+test_nl_multibyte ()
+{
+  {
+    export LC_ALL="$LOCALE_FR_UTF8"
+    # A missing second character implies ':'.
+    env printf "$2$2$2\na\n$2$2\nb\n$2\nc\n" > inp || framework_failure_
+    nl -p -ha -fa -d $(env printf "$1") < inp > out || fail=1
+  }
+  compare exp out
+}
+
+# Implied ':' character.
+test_nl_multibyte '\xc3' '\xc3:' || fail=1
+test_nl_multibyte '\uB250' '\uB250:' || fail=1
+
+# Two characters.
+test_nl_multibyte '\xc3\xc3' '\xc3\xc3' || fail=1
+test_nl_multibyte '\uB250\uB250' '\uB250\uB250' || fail=1
+
+# More than 2 characters is a GNU extension.
+test_nl_multibyte '\uB250\uB250\uB250' '\uB250\uB250\uB250' || fail=1
+test_nl_multibyte "$(bad_unicode)" "$(bad_unicode)" || fail=1
+
+Exit $fail
diff --git a/tests/misc/nl.sh b/tests/nl/nl.sh
similarity index 100%
rename from tests/misc/nl.sh
rename to tests/nl/nl.sh
-- 
2.53.0

[PATCH] nl: support multibyte section delimiters

Reply via email to