better i18n for join, uniq, etc.

Paul Eggert Mon, 30 Oct 2023 01:49:33 -0700

I installed the attached patches to GNU Coreutils so that join and uniq support multi-byte characters better out-of-the-box. This uses Gnulib's new mcel module which makes for simpler multi-byte processing than what's in Fedora's i18n patches for Coreutils. (I also hope it's faster, though I haven't tested this.)

The idea is to continue this process of using mcel for the other programs where vanilla Coreutils doesn't conform to POSIX in multi-byte locales.

The key patch is 0009. Patch 0010 brings in the Fedora tests for join and uniq in multi-byte locales; these tests pass for me.

Some work is still needed for ignoring case in join and uniq. As I understand it, the Fedora patches don't support 'uniq --ignore-case' in multi-byte locales. They do support 'join --ignore-case', though they ignore it in the simple-minded way that GNU diff does (except diff lowercases first whereas Fedora join uppercases first; although neither approach is perfect, isn't lowercasing better?).

Comments welcome. If the idea isn't a good one we can back out the patches. But I hope this can move forward.

From 0292a5678a19cb3f3908cf3b267aa1f18b479aac Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 27 Oct 2023 08:45:50 -0700
Subject: [PATCH 01/11] maint: prefer c_isxdigit when that is the intent

* src/digest.c (valid_digits, split_3):
* src/echo.c (main):
* src/printf.c (print_esc):
* src/ptx.c (unescape_string):
* src/stat.c (print_it):
When the code is supposed to support only POSIX-locale hex digits,
use c_isxdigit rather than isxdigit.  Include c-ctype.h as needed.
This defends against oddball locales where isxdigit != c_isxdigit.
---
 src/digest.c | 5 +++--
 src/echo.c   | 5 +++--
 src/printf.c | 5 +++--
 src/ptx.c    | 3 ++-
 src/stat.c   | 5 +++--
 5 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/digest.c b/src/digest.c
index b996dde11..1f3695308 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -23,6 +23,7 @@
 
 #include "system.h"
 #include "argmatch.h"
+#include "c-ctype.h"
 #include "quote.h"
 #include "xdectoint.h"
 #include "xstrtol.h"
@@ -660,7 +661,7 @@ valid_digits (unsigned char const *s, size_t len)
     {
       for (idx_t i = 0; i < digest_hex_bytes; i++)
         {
-          if (!isxdigit (*s))
+          if (!c_isxdigit (*s))
             return false;
           ++s;
         }
@@ -856,7 +857,7 @@ split_3 (char *s, size_t s_len,
 # endif
   unsigned char const *hp = *digest;
   digest_hex_bytes = 0;
-  while (isxdigit (*hp++))
+  while (c_isxdigit (*hp++))
     digest_hex_bytes++;
   if (digest_hex_bytes < 2 || digest_hex_bytes % 2
       || BLAKE2B_MAX_LEN * 2 < digest_hex_bytes)
diff --git a/src/echo.c b/src/echo.c
index 278778ec6..f80ead86f 100644
--- a/src/echo.c
+++ b/src/echo.c
@@ -19,6 +19,7 @@
 #include <sys/types.h>
 #include "system.h"
 #include "assure.h"
+#include "c-ctype.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
 #define PROGRAM_NAME "echo"
@@ -219,12 +220,12 @@ just_echo:
                     case 'x':
                       {
                         unsigned char ch = *s;
-                        if (! isxdigit (ch))
+                        if (! c_isxdigit (ch))
                           goto not_an_escape;
                         s++;
                         c = hextobin (ch);
                         ch = *s;
-                        if (isxdigit (ch))
+                        if (c_isxdigit (ch))
                           {
                             s++;
                             c = c * 16 + hextobin (ch);
diff --git a/src/printf.c b/src/printf.c
index f36b45519..ebe09ba76 100644
--- a/src/printf.c
+++ b/src/printf.c
@@ -56,6 +56,7 @@
 #include <wchar.h>
 
 #include "system.h"
+#include "c-ctype.h"
 #include "cl-strtod.h"
 #include "quote.h"
 #include "unicodeio.h"
@@ -262,7 +263,7 @@ print_esc (char const *escstart, bool octal_0)
     {
       /* A hexadecimal \xhh escape sequence must have 1 or 2 hex. digits.  */
       for (esc_length = 0, ++p;
-           esc_length < 2 && isxdigit (to_uchar (*p));
+           esc_length < 2 && c_isxdigit (to_uchar (*p));
            ++esc_length, ++p)
         esc_value = esc_value * 16 + hextobin (*p);
       if (esc_length == 0)
@@ -292,7 +293,7 @@ print_esc (char const *escstart, bool octal_0)
            esc_length > 0;
            --esc_length, ++p)
         {
-          if (! isxdigit (to_uchar (*p)))
+          if (! c_isxdigit (to_uchar (*p)))
             error (EXIT_FAILURE, 0, _("missing hexadecimal number in escape"));
           uni_value = uni_value * 16 + hextobin (*p);
         }
diff --git a/src/ptx.c b/src/ptx.c
index 3601875ed..3cd84b2e9 100644
--- a/src/ptx.c
+++ b/src/ptx.c
@@ -24,6 +24,7 @@
 #include "system.h"
 #include <regex.h>
 #include "argmatch.h"
+#include "c-ctype.h"
 #include "fadvise.h"
 #include "quote.h"
 #include "read-file.h"
@@ -308,7 +309,7 @@ unescape_string (char *string)
             case 'x':		/* \xhhh escape, 3 chars maximum */
               value = 0;
               for (length = 0, string++;
-                   length < 3 && isxdigit (to_uchar (*string));
+                   length < 3 && c_isxdigit (to_uchar (*string));
                    length++, string++)
                 value = value * 16 + HEXTOBIN (*string);
               if (length == 0)
diff --git a/src/stat.c b/src/stat.c
index 39acfee70..522e922ed 100644
--- a/src/stat.c
+++ b/src/stat.c
@@ -58,6 +58,7 @@
 
 #include "areadlink.h"
 #include "argmatch.h"
+#include "c-ctype.h"
 #include "file-type.h"
 #include "filemode.h"
 #include "fs.h"
@@ -1215,13 +1216,13 @@ print_it (char const *format, int fd, char const *filename,
               putchar (esc_value);
               --b;
             }
-          else if (*b == 'x' && isxdigit (to_uchar (b[1])))
+          else if (*b == 'x' && c_isxdigit (to_uchar (b[1])))
             {
               int esc_value = hextobin (b[1]);	/* Value of \xhh escape. */
               /* A hexadecimal \xhh escape sequence must have
                  1 or 2 hex. digits.  */
               ++b;
-              if (isxdigit (to_uchar (b[1])))
+              if (c_isxdigit (to_uchar (b[1])))
                 {
                   ++b;
                   esc_value = esc_value * 16 + hextobin (*b);
-- 
2.39.2

From 2f3d9524bb4d803e5adcf91f8cb2f068fe912c44 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 27 Oct 2023 08:56:39 -0700
Subject: [PATCH 02/11] digest: omit unnecessary b2sum includes

* src/blake2/b2sum.c: Do not include string.h, errno.h,
ctype.h, unistd.h, getopt.h.
---
 src/blake2/b2sum.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/blake2/b2sum.c b/src/blake2/b2sum.c
index 1a7e99f0e..5d69ff8d4 100644
--- a/src/blake2/b2sum.c
+++ b/src/blake2/b2sum.c
@@ -19,12 +19,6 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include <ctype.h>
-#include <unistd.h>
-#include <getopt.h>
 
 #include "blake2.h"
 
@@ -133,6 +127,11 @@ cleanup_buffer:
 
 #if 0
 
+#include <errno.h>
+#include <getopt.h>
+#include <string.h>
+#include <unistd.h>
+
 int blake2sp_stream( FILE *stream, void *resstream, size_t outbytes )
 {
   int ret = -1;
-- 
2.39.2

From 684e810ae2de35dd2761bc28149280a249810d5b Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 27 Oct 2023 17:15:08 -0700
Subject: [PATCH 03/11] maint: move field_sep into separate module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is so that we don’t need to have every source file
include ctype.h.
* bootstrap.conf (gnulib_modules): Add cu-ctype.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
New files.
* src/join.c, src/numfmt.c, src/sort.c, src/uniq.c:
Include cu-ctype.h, for field_sep.
* src/system.h (field_sep): Remove; now supplied by cu-ctype.
---
 bootstrap.conf      |  1 +
 gl/lib/cu-ctype.c   |  3 +++
 gl/lib/cu-ctype.h   | 35 +++++++++++++++++++++++++++++++++++
 gl/modules/cu-ctype | 24 ++++++++++++++++++++++++
 src/join.c          |  1 +
 src/numfmt.c        |  1 +
 src/sort.c          |  1 +
 src/system.h        |  7 -------
 src/uniq.c          |  1 +
 9 files changed, 67 insertions(+), 7 deletions(-)
 create mode 100644 gl/lib/cu-ctype.c
 create mode 100644 gl/lib/cu-ctype.h
 create mode 100644 gl/modules/cu-ctype

diff --git a/bootstrap.conf b/bootstrap.conf
index db0c90c67..4724544d7 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -70,6 +70,7 @@ gnulib_modules="
   crypto/sha256
   crypto/sha512
   crypto/sm3
+  cu-ctype
   cycle-check
   d-ino
   d-type
diff --git a/gl/lib/cu-ctype.c b/gl/lib/cu-ctype.c
new file mode 100644
index 000000000..9f753de2e
--- /dev/null
+++ b/gl/lib/cu-ctype.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
+#include <cu-ctype.h>
diff --git a/gl/lib/cu-ctype.h b/gl/lib/cu-ctype.h
new file mode 100644
index 000000000..82f1d73f2
--- /dev/null
+++ b/gl/lib/cu-ctype.h
@@ -0,0 +1,35 @@
+/* Character type definitions for coreutils
+
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <ctype.h>
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+# error "Please include config.h first."
+#endif
+_GL_INLINE_HEADER_BEGIN
+#ifndef CU_CTYPE_INLINE
+# define CU_CTYPE_INLINE _GL_INLINE
+#endif
+
+/* '\n' is considered a field separator with  --zero-terminated.  */
+CU_CTYPE_INLINE bool
+field_sep (unsigned char ch)
+{
+  return isblank (ch) || ch == '\n';
+}
+
+_GL_INLINE_HEADER_END
diff --git a/gl/modules/cu-ctype b/gl/modules/cu-ctype
new file mode 100644
index 000000000..bd328b32e
--- /dev/null
+++ b/gl/modules/cu-ctype
@@ -0,0 +1,24 @@
+Description:
+ctype.h-like definitions for coreutils
+
+Files:
+lib/cu-ctype.c
+lib/cu-ctype.h
+
+Depends-on:
+ctype
+extern-inline
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += cu-ctype.c
+
+Include:
+"cu-ctype.h"
+
+License:
+GPL
+
+Maintainer:
+all
diff --git a/src/join.c b/src/join.c
index 7eef58c0b..b95cf2b9b 100644
--- a/src/join.c
+++ b/src/join.c
@@ -23,6 +23,7 @@
 
 #include "system.h"
 #include "assure.h"
+#include "cu-ctype.h"
 #include "fadvise.h"
 #include "hard-locale.h"
 #include "linebuffer.h"
diff --git a/src/numfmt.c b/src/numfmt.c
index 8fd6e77ad..2ce70226c 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -24,6 +24,7 @@
 
 #include "argmatch.h"
 #include "c-ctype.h"
+#include "cu-ctype.h"
 #include "mbswidth.h"
 #include "quote.h"
 #include "system.h"
diff --git a/src/sort.c b/src/sort.c
index 5c86b8332..6856e6151 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -31,6 +31,7 @@
 #include "system.h"
 #include "argmatch.h"
 #include "assure.h"
+#include "cu-ctype.h"
 #include "fadvise.h"
 #include "filevercmp.h"
 #include "flexmember.h"
diff --git a/src/system.h b/src/system.h
index 21b15839b..b4e0a7275 100644
--- a/src/system.h
+++ b/src/system.h
@@ -158,13 +158,6 @@ enum
    errors that the cast doesn't.  */
 static inline unsigned char to_uchar (char ch) { return ch; }
 
-/* '\n' is considered a field separator with  --zero-terminated.  */
-static inline bool
-field_sep (unsigned char ch)
-{
-  return isblank (ch) || ch == '\n';
-}
-
 #include <locale.h>
 
 /* Take care of NLS matters.  */
diff --git a/src/uniq.c b/src/uniq.c
index d294ed665..7e177ac5a 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -23,6 +23,7 @@
 
 #include "system.h"
 #include "argmatch.h"
+#include "cu-ctype.h"
 #include "linebuffer.h"
 #include "fadvise.h"
 #include "posixver.h"
-- 
2.39.2

From 4edb14d20f972595fd08f841b94f7454752e2b5f Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 27 Oct 2023 17:31:49 -0700
Subject: [PATCH 04/11] maint: include ctype.h selectively

Include ctype.h only in files that need it.  Many of its uses
are incorrect, as they assume single-byte locales.  The idea is
to remove the incorrect uses later, when there is time.
* src/chroot.c, src/csplit.c, src/dd.c, src/digest.c, src/dircolors.c:
* src/expand-common.c, src/expand.c, src/fmt.c, src/fold.c, src/ls.c:
* src/od.c, src/pinky.c, src/pr.c, src/ptx.c, src/seq.c:
* src/set-fields.c, src/split.c, src/stdbuf.c, src/test.c:
* src/tr.c, src/truncate.c, src/unexpand.c, src/wc.c:
Include ctype.h.
* src/system.h: Do not include ctype.h.

include ctype.h.o
---
 src/chroot.c        | 1 +
 src/csplit.c        | 1 +
 src/dd.c            | 1 +
 src/digest.c        | 1 +
 src/dircolors.c     | 1 +
 src/expand-common.c | 1 +
 src/expand.c        | 1 +
 src/fmt.c           | 1 +
 src/fold.c          | 1 +
 src/ls.c            | 1 +
 src/od.c            | 1 +
 src/pinky.c         | 1 +
 src/pr.c            | 1 +
 src/ptx.c           | 1 +
 src/seq.c           | 1 +
 src/set-fields.c    | 1 +
 src/split.c         | 1 +
 src/stdbuf.c        | 1 +
 src/system.h        | 2 --
 src/test.c          | 1 +
 src/tr.c            | 1 +
 src/truncate.c      | 1 +
 src/unexpand.c      | 1 +
 src/wc.c            | 1 +
 24 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/chroot.c b/src/chroot.c
index 6150af5cd..17af5ebe4 100644
--- a/src/chroot.c
+++ b/src/chroot.c
@@ -17,6 +17,7 @@
 /* Written by Roland McGrath.  */
 
 #include <config.h>
+#include <ctype.h>
 #include <getopt.h>
 #include <stdio.h>
 #include <sys/types.h>
diff --git a/src/csplit.c b/src/csplit.c
index dca525aaf..32fb96bca 100644
--- a/src/csplit.c
+++ b/src/csplit.c
@@ -19,6 +19,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <getopt.h>
 #include <sys/types.h>
 #include <signal.h>
diff --git a/src/dd.c b/src/dd.c
index 595b8755b..85ea26a3f 100644
--- a/src/dd.c
+++ b/src/dd.c
@@ -18,6 +18,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <sys/types.h>
 #include <signal.h>
 #include <stdckdint.h>
diff --git a/src/digest.c b/src/digest.c
index 1f3695308..336392608 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -18,6 +18,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <getopt.h>
 #include <sys/types.h>
 
diff --git a/src/dircolors.c b/src/dircolors.c
index 8a86efb76..f9001de07 100644
--- a/src/dircolors.c
+++ b/src/dircolors.c
@@ -17,6 +17,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <sys/types.h>
 #include <fnmatch.h>
 #include <getopt.h>
diff --git a/src/expand-common.c b/src/expand-common.c
index 89fa56ace..16240802d 100644
--- a/src/expand-common.c
+++ b/src/expand-common.c
@@ -16,6 +16,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include "system.h"
diff --git a/src/expand.c b/src/expand.c
index 0e74d0cf6..00f2119c6 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -34,6 +34,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
diff --git a/src/fmt.c b/src/fmt.c
index ad7a9ce56..b6fe74630 100644
--- a/src/fmt.c
+++ b/src/fmt.c
@@ -17,6 +17,7 @@
 /* Written by Ross Paterson <[email protected]>.  */
 
 #include <config.h>
+#include <ctype.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <getopt.h>
diff --git a/src/fold.c b/src/fold.c
index 5c0428d80..1a3859097 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -18,6 +18,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
diff --git a/src/ls.c b/src/ls.c
index 769ae85a7..e16972d3e 100644
--- a/src/ls.c
+++ b/src/ls.c
@@ -36,6 +36,7 @@
    Greg Lee <[email protected]>.  */
 
 #include <config.h>
+#include <ctype.h>
 #include <sys/types.h>
 
 #include <termios.h>
diff --git a/src/od.c b/src/od.c
index 538175af7..951e88652 100644
--- a/src/od.c
+++ b/src/od.c
@@ -18,6 +18,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdckdint.h>
 #include <stdio.h>
 #include <getopt.h>
diff --git a/src/pinky.c b/src/pinky.c
index db0d2557b..8c872b2fe 100644
--- a/src/pinky.c
+++ b/src/pinky.c
@@ -17,6 +17,7 @@
 /* Created by hacking who.c by Kaveh Ghazi [email protected] */
 
 #include <config.h>
+#include <ctype.h>
 #include <getopt.h>
 #include <pwd.h>
 #include <stdckdint.h>
diff --git a/src/pr.c b/src/pr.c
index 57361d629..7e680e23c 100644
--- a/src/pr.c
+++ b/src/pr.c
@@ -309,6 +309,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <getopt.h>
 #include <stdckdint.h>
 #include <sys/types.h>
diff --git a/src/ptx.c b/src/ptx.c
index 3cd84b2e9..c1524fed7 100644
--- a/src/ptx.c
+++ b/src/ptx.c
@@ -19,6 +19,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <getopt.h>
 #include <sys/types.h>
 #include "system.h"
diff --git a/src/seq.c b/src/seq.c
index 2822d5c2c..96d14be1c 100644
--- a/src/seq.c
+++ b/src/seq.c
@@ -17,6 +17,7 @@
 /* Written by Ulrich Drepper.  */
 
 #include <config.h>
+#include <ctype.h>
 #include <getopt.h>
 #include <stdio.h>
 #include <sys/types.h>
diff --git a/src/set-fields.c b/src/set-fields.c
index b299280c0..a524b7fa4 100644
--- a/src/set-fields.c
+++ b/src/set-fields.c
@@ -19,6 +19,7 @@
 #include <config.h>
 
 #include "system.h"
+#include <ctype.h>
 #include "quote.h"
 #include "set-fields.h"
 
diff --git a/src/split.c b/src/split.c
index d2cd23234..f56a144a6 100644
--- a/src/split.c
+++ b/src/split.c
@@ -21,6 +21,7 @@
    * support --suppress-matched as in csplit.  */
 #include <config.h>
 
+#include <ctype.h>
 #include <stdckdint.h>
 #include <stdio.h>
 #include <getopt.h>
diff --git a/src/stdbuf.c b/src/stdbuf.c
index 1ec23cf8b..51326ad4e 100644
--- a/src/stdbuf.c
+++ b/src/stdbuf.c
@@ -17,6 +17,7 @@
 /* Written by Pádraig Brady.  */
 
 #include <config.h>
+#include <ctype.h>
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
diff --git a/src/system.h b/src/system.h
index b4e0a7275..8c2a4fd8b 100644
--- a/src/system.h
+++ b/src/system.h
@@ -142,8 +142,6 @@ enum
 
 #include "timespec.h"
 
-#include <ctype.h>
-
 /* ISDIGIT differs from isdigit, as follows:
    - Its arg may be any int or unsigned int; it need not be an unsigned char
      or EOF.
diff --git a/src/test.c b/src/test.c
index a4eb40a52..2bcb9abc8 100644
--- a/src/test.c
+++ b/src/test.c
@@ -27,6 +27,7 @@
 #endif
 
 #include <config.h>
+#include <ctype.h>
 #include <stdio.h>
 #include <sys/types.h>
 
diff --git a/src/tr.c b/src/tr.c
index 625c27583..292aae1d4 100644
--- a/src/tr.c
+++ b/src/tr.c
@@ -18,6 +18,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <getopt.h>
diff --git a/src/truncate.c b/src/truncate.c
index 4a828e1da..040172c4d 100644
--- a/src/truncate.c
+++ b/src/truncate.c
@@ -21,6 +21,7 @@
    to better fit the "GNU" environment.  */
 
 #include <config.h>             /* sets _FILE_OFFSET_BITS=64 etc. */
+#include <ctype.h>
 #include <stdckdint.h>
 #include <stdio.h>
 #include <getopt.h>
diff --git a/src/unexpand.c b/src/unexpand.c
index 5a2283fdd..46e943365 100644
--- a/src/unexpand.c
+++ b/src/unexpand.c
@@ -35,6 +35,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
diff --git a/src/wc.c b/src/wc.c
index e69ad0d51..43170cf9b 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -19,6 +19,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <stdckdint.h>
 #include <stdio.h>
 #include <getopt.h>
-- 
2.39.2

From 5602342a16e81be25ec00b12af847fc0c72f6589 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 28 Oct 2023 09:07:14 -0700
Subject: [PATCH 05/11] maint: port to oddball tolower

* src/digest.c (hex_equal): Work even in oddball locales
where tolower does not work as expected on ASCII letters.
---
 src/digest.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/digest.c b/src/digest.c
index 336392608..052fa4db7 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -18,7 +18,6 @@
 
 #include <config.h>
 
-#include <ctype.h>
 #include <getopt.h>
 #include <sys/types.h>
 
@@ -1122,9 +1121,9 @@ hex_equal (unsigned char const *hex_digest, unsigned char const *bin_buffer)
   size_t cnt;
   for (cnt = 0; cnt < digest_bin_bytes; ++cnt)
     {
-      if (tolower (hex_digest[2 * cnt])
+      if (c_tolower (hex_digest[2 * cnt])
           != bin2hex[bin_buffer[cnt] >> 4]
-          || (tolower (hex_digest[2 * cnt + 1])
+          || (c_tolower (hex_digest[2 * cnt + 1])
               != (bin2hex[bin_buffer[cnt] & 0xf])))
         break;
     }
-- 
2.39.2

From 8d60cd8ad69a0c0cd0dcd86e774157bddb41cb79 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 28 Oct 2023 09:22:09 -0700
Subject: [PATCH 06/11] dircolors: assume C-locale spaces

* src/dircolors.c: Include c-ctype.h, not ctype.h.
(parse_line): Use c_isspace, not isspace, as the .dircolors
file format (which does not seem to be documented!) appears
to be ASCII.
---
 src/dircolors.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/dircolors.c b/src/dircolors.c
index f9001de07..75ea51603 100644
--- a/src/dircolors.c
+++ b/src/dircolors.c
@@ -17,13 +17,13 @@
 
 #include <config.h>
 
-#include <ctype.h>
 #include <sys/types.h>
 #include <fnmatch.h>
 #include <getopt.h>
 
 #include "system.h"
 #include "dircolors.h"
+#include "c-ctype.h"
 #include "c-strcase.h"
 #include "obstack.h"
 #include "quote.h"
@@ -153,7 +153,7 @@ parse_line (char const *line, char **keyword, char **arg)
   *keyword = nullptr;
   *arg = nullptr;
 
-  for (p = line; isspace (to_uchar (*p)); ++p)
+  for (p = line; c_isspace (to_uchar (*p)); ++p)
     continue;
 
   /* Ignore blank lines and shell-style comments.  */
@@ -162,7 +162,7 @@ parse_line (char const *line, char **keyword, char **arg)
 
   keyword_start = p;
 
-  while (!isspace (to_uchar (*p)) && *p != '\0')
+  while (!c_isspace (to_uchar (*p)) && *p != '\0')
     {
       ++p;
     }
@@ -175,7 +175,7 @@ parse_line (char const *line, char **keyword, char **arg)
     {
       ++p;
     }
-  while (isspace (to_uchar (*p)));
+  while (c_isspace (to_uchar (*p)));
 
   if (*p == '\0' || *p == '#')
     return;
@@ -185,7 +185,7 @@ parse_line (char const *line, char **keyword, char **arg)
   while (*p != '\0' && *p != '#')
     ++p;
 
-  for (--p; isspace (to_uchar (*p)); --p)
+  for (--p; c_isspace (to_uchar (*p)); --p)
     continue;
   ++p;
 
-- 
2.39.2

From a3ce33c106c3db936deb3bfa9784d6e53f921233 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 28 Oct 2023 09:30:49 -0700
Subject: [PATCH 07/11] stdbuf: port to oddball toupper

* src/stdbuf.c: Do not include ctype.h.
(set_libstdbuf_options): Use c_toupper, not toupper,
since the C locale is intended here.
---
 src/stdbuf.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/stdbuf.c b/src/stdbuf.c
index 51326ad4e..65142fd8c 100644
--- a/src/stdbuf.c
+++ b/src/stdbuf.c
@@ -17,7 +17,6 @@
 /* Written by Pádraig Brady.  */
 
 #include <config.h>
-#include <ctype.h>
 #include <stdio.h>
 #include <getopt.h>
 #include <sys/types.h>
@@ -286,10 +285,10 @@ set_libstdbuf_options (void)
 
           if (*stdbuf[i].optarg == 'L')
             ret = asprintf (&var, "%s%c=L", "_STDBUF_",
-                            toupper (stdbuf[i].optc));
+                            c_toupper (stdbuf[i].optc));
           else
             ret = asprintf (&var, "%s%c=%zu", "_STDBUF_",
-                            toupper (stdbuf[i].optc),
+                            c_toupper (stdbuf[i].optc),
                             stdbuf[i].size);
           if (ret < 0)
             xalloc_die ();
-- 
2.39.2

From 2709bea0f440507ac009e6e7ded453bb792d6842 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 28 Oct 2023 16:15:49 -0700
Subject: [PATCH 08/11] test: allow non-blank white space in numbers

* src/test.c (find_int): Use isspace, not isblank,
for compatibility with how strtol works, which
is how most other shells do this.
---
 src/test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test.c b/src/test.c
index 2bcb9abc8..4f14e4080 100644
--- a/src/test.c
+++ b/src/test.c
@@ -136,7 +136,7 @@ find_int (char const *string)
   char const *p;
   char const *number_start;
 
-  for (p = string; isblank (to_uchar (*p)); p++)
+  for (p = string; isspace (to_uchar (*p)); p++)
     continue;
 
   if (*p == '+')
@@ -154,7 +154,7 @@ find_int (char const *string)
     {
       while (ISDIGIT (*p))
         p++;
-      while (isblank (to_uchar (*p)))
+      while (isspace (to_uchar (*p)))
         p++;
       if (!*p)
         return number_start;
-- 
2.39.2

From 11b01fc21f1dff2685477c03596a0a4009aec7da Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Mon, 30 Oct 2023 00:32:51 -0700
Subject: [PATCH 09/11] join,uniq: support multi-byte separators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* NEWS: Mention this.
* bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module
is now more trouble than it’s worth.  All uses removed.
Add skipchars.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
Remove.
* gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars:
* tests/misc/join-utf8.sh:
New files.
* src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h.
(tab): Now mcel_t, not int.  All uses changed.
(output_separator, output_seplen): New static vars.
(eq_tab, newline_or_blank, comma_or_blank): New functions.
(xfields, prfields, prjoin, add_field_list, main):
Support multi-byte characters.
* src/numfmt.c: Include ctype.h, skipchars.h.
Do not include cu-ctype.h.
(newline_or_blank): New function.
(next_field): Support multi-byte characters.
* src/sort.c: Include ctype.h instead of cu-ctype.h.
(inittables): Open-code field_sep since it no longer exists.
‘sort’ is not multi-byte safe yet, but when it is this code
will need revamping anyway.
* src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h.
(newline_or_blank): New function.
(find_field): Support multi-byte characters.
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh
---
 NEWS                    |   5 ++
 bootstrap.conf          |   2 +-
 gl/lib/cu-ctype.c       |   3 -
 gl/lib/cu-ctype.h       |  35 ------------
 gl/lib/skipchars.c      |   3 +
 gl/lib/skipchars.h      |  56 +++++++++++++++++++
 gl/modules/cu-ctype     |  24 --------
 gl/modules/skipchars    |  24 ++++++++
 src/join.c              | 119 +++++++++++++++++++++++++---------------
 src/numfmt.c            |  16 ++++--
 src/sort.c              |   6 +-
 src/uniq.c              |  27 +++++----
 tests/local.mk          |   1 +
 tests/misc/join-utf8.sh |  51 +++++++++++++++++
 14 files changed, 244 insertions(+), 128 deletions(-)
 delete mode 100644 gl/lib/cu-ctype.c
 delete mode 100644 gl/lib/cu-ctype.h
 create mode 100644 gl/lib/skipchars.c
 create mode 100644 gl/lib/skipchars.h
 delete mode 100644 gl/modules/cu-ctype
 create mode 100644 gl/modules/skipchars
 create mode 100755 tests/misc/join-utf8.sh

diff --git a/NEWS b/NEWS
index 3021211dc..b1088f683 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,11 @@ GNU coreutils NEWS                                    -*- outline -*-
   to preserve ownership" when copying to GNU/Linux CIFS file systems.
   They do this by working around some Linux CIFS bugs.
 
+  join and uniq now support multi-byte characters better.
+  For example, 'join -tX' now works even if X is a multi-byte character,
+  and both programs now treat multi-byte characters like U+3000
+  IDEOGRAPHIC SPACE as blanks if the current locale treats them so.
+
   numfmt options like --suffix no longer have an arbitrary 127-byte limit.
   [bug introduced with numfmt in coreutils-8.21]
 
diff --git a/bootstrap.conf b/bootstrap.conf
index 4724544d7..97645d6f0 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -70,7 +70,6 @@ gnulib_modules="
   crypto/sha256
   crypto/sha512
   crypto/sm3
-  cu-ctype
   cycle-check
   d-ino
   d-type
@@ -241,6 +240,7 @@ gnulib_modules="
   settime
   sig2str
   sigaction
+  skipchars
   smack
   ssize_t
   stat-macros
diff --git a/gl/lib/cu-ctype.c b/gl/lib/cu-ctype.c
deleted file mode 100644
index 9f753de2e..000000000
--- a/gl/lib/cu-ctype.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <config.h>
-#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
-#include <cu-ctype.h>
diff --git a/gl/lib/cu-ctype.h b/gl/lib/cu-ctype.h
deleted file mode 100644
index 82f1d73f2..000000000
--- a/gl/lib/cu-ctype.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Character type definitions for coreutils
-
-   Copyright 2023 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-#include <ctype.h>
-
-#ifndef _GL_INLINE_HEADER_BEGIN
-# error "Please include config.h first."
-#endif
-_GL_INLINE_HEADER_BEGIN
-#ifndef CU_CTYPE_INLINE
-# define CU_CTYPE_INLINE _GL_INLINE
-#endif
-
-/* '\n' is considered a field separator with  --zero-terminated.  */
-CU_CTYPE_INLINE bool
-field_sep (unsigned char ch)
-{
-  return isblank (ch) || ch == '\n';
-}
-
-_GL_INLINE_HEADER_END
diff --git a/gl/lib/skipchars.c b/gl/lib/skipchars.c
new file mode 100644
index 000000000..827c89d45
--- /dev/null
+++ b/gl/lib/skipchars.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
+#include <skipchars.h>
diff --git a/gl/lib/skipchars.h b/gl/lib/skipchars.h
new file mode 100644
index 000000000..baa9eaba6
--- /dev/null
+++ b/gl/lib/skipchars.h
@@ -0,0 +1,56 @@
+/* Skipping sequences of characters satisfying a predicate
+
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include "mcel.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef SKIPCHARS_INLINE
+# define SKIPCHARS_INLINE _GL_INLINE
+#endif
+
+/* Return the address just past the leading sequence of possibly
+   multi-byte characters or encoding errors G in STR that satisfy
+   PREDICATE (G) if OK is true, or that do not satisfy the predicate
+   call if OK is false.  */
+
+SKIPCHARS_INLINE char *
+skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
+{
+  char const *s = str;
+  for (mcel_t g; *s && predicate (g = mcel_scanz (s)) == ok;
+       s += g.len)
+    continue;
+  return (char *) s;
+}
+
+/* Return the address just past the leading sequence of possibly
+   multi-byte characters or encoding errors G in BUF (which ends at LIM)
+   that satisfy PREDICATE (G) if OK is true, or that do not satisfy
+   the predicate call if OK is false.  */
+
+SKIPCHARS_INLINE char *
+skip_buf_matching (char const *buf, char const *lim,
+                   bool (*predicate) (mcel_t), bool ok)
+{
+  char const *s = buf;
+  for (mcel_t g; s < lim && predicate (g = mcel_scan (s, lim)) == ok;
+       s += g.len)
+    continue;
+  return (char *) s;
+}
+
+_GL_INLINE_HEADER_END
diff --git a/gl/modules/cu-ctype b/gl/modules/cu-ctype
deleted file mode 100644
index bd328b32e..000000000
--- a/gl/modules/cu-ctype
+++ /dev/null
@@ -1,24 +0,0 @@
-Description:
-ctype.h-like definitions for coreutils
-
-Files:
-lib/cu-ctype.c
-lib/cu-ctype.h
-
-Depends-on:
-ctype
-extern-inline
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += cu-ctype.c
-
-Include:
-"cu-ctype.h"
-
-License:
-GPL
-
-Maintainer:
-all
diff --git a/gl/modules/skipchars b/gl/modules/skipchars
new file mode 100644
index 000000000..3b25fd6eb
--- /dev/null
+++ b/gl/modules/skipchars
@@ -0,0 +1,24 @@
+Description:
+Skip sequences of multi-byte characters or encoding errors
+
+Files:
+lib/skipchars.c
+lib/skipchars.h
+
+Depends-on:
+extern-inline
+mcel
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += skipchars.c
+
+Include:
+"skipchars.h"
+
+License:
+GPL
+
+Maintainer:
+all
diff --git a/src/join.c b/src/join.c
index b95cf2b9b..b3ad27465 100644
--- a/src/join.c
+++ b/src/join.c
@@ -23,12 +23,13 @@
 
 #include "system.h"
 #include "assure.h"
-#include "cu-ctype.h"
 #include "fadvise.h"
 #include "hard-locale.h"
 #include "linebuffer.h"
+#include "mcel.h"
 #include "memcasecmp.h"
 #include "quote.h"
+#include "skipchars.h"
 #include "stdio--.h"
 #include "xmemcoll.h"
 #include "xstrtol.h"
@@ -135,10 +136,14 @@ static struct outlist outlist_head;
 /* Last element in 'outlist', where a new element can be added.  */
 static struct outlist *outlist_end = &outlist_head;
 
-/* Tab character separating fields.  If negative, fields are separated
-   by any nonempty string of blanks, otherwise by exactly one
-   tab character whose value (when cast to unsigned char) equals TAB.  */
-static int tab = -1;
+/* Tab character (or encoding error) separating fields.  If TAB.len == 0,
+   fields are separated by any nonempty string of blanks, otherwise by
+   exactly one tab character (or encoding error) equal to TAB.  */
+static mcel_t tab;
+
+/* The output separator to use, and its length in bytes.  */
+static char const *output_separator = " ";
+static idx_t output_seplen = 1;
 
 /* If nonzero, check that the input is correctly ordered. */
 static enum
@@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len)
   ++(line->nfields);
 }
 
+static bool
+eq_tab (mcel_t g)
+{
+  return mcel_cmp (g, tab) == 0;  /* G is the -t field separator.  */
+}
+
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);  /* Separator if no -t.  */
+}
+
 /* Fill in the 'fields' structure in LINE.  */
 
 static void
@@ -278,34 +295,29 @@ xfields (struct line *line)
   if (ptr == lim)
     return;
 
-  if (0 <= tab && tab != '\n')
-    {
-      char *sep;
-      for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
-        extract_field (line, ptr, sep - ptr);
-    }
-  else if (tab < 0)
+  if (!tab.len)
     {
-      /* Skip leading blanks before the first field.  */
-      while (field_sep (*ptr))
-        if (++ptr == lim)
-          return;
-
-      do
+      while (ptr < lim)
         {
-          char *sep;
-          for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
-            continue;
+          ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
+          if (!*ptr)
+            break;
+          char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
           extract_field (line, ptr, sep - ptr);
-          if (sep == lim)
-            return;
-          for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
-            continue;
+          ptr = sep;
         }
-      while (ptr != lim);
     }
+  else
+    {
+      if (tab.ch != '\n')
+        for (char *sep;
+             ((sep = skip_buf_matching (ptr, lim, eq_tab, false))
+              < lim);
+             ptr = sep + mcel_scan (sep, lim).len)
+          extract_field (line, ptr, sep - ptr);
 
-  extract_field (line, ptr, lim - ptr);
+      extract_field (line, ptr, lim - ptr);
+    }
 }
 
 static void
@@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount)
 {
   idx_t i;
   idx_t nfields = autoformat ? autocount : line->nfields;
-  char output_separator = tab < 0 ? ' ' : tab;
 
   for (i = 0; i < join_field && i < nfields; ++i)
     {
-      putchar (output_separator);
+      fwrite (output_separator, 1, output_seplen, stdout);
       prfield (i, line);
     }
   for (i = join_field + 1; i < nfields; ++i)
     {
-      putchar (output_separator);
+      fwrite (output_separator, 1, output_seplen, stdout);
       prfield (i, line);
     }
 }
@@ -588,7 +599,6 @@ static void
 prjoin (struct line const *line1, struct line const *line2)
 {
   const struct outlist *outlist;
-  char output_separator = tab < 0 ? ' ' : tab;
   idx_t field;
   struct line const *line;
 
@@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2)
           o = o->next;
           if (o == nullptr)
             break;
-          putchar (output_separator);
+          fwrite (output_separator, 1, output_seplen, stdout);
         }
       putchar (eolchar);
     }
@@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
     }
 }
 
+static bool
+comma_or_blank (mcel_t g)
+{
+  return g.ch == ',' || c32isblank (g.ch);  /* G separates field specs.  */
+}
+
 /* Add the comma or blank separated field spec(s) in STR to 'outlist'.  */
 
 static void
@@ -898,14 +914,17 @@ add_field_list (char *str)
       int file_index;
       idx_t field_index;
       char const *spec_item = p;
-
-      p = strpbrk (p, ", \t");
-      if (p)
-        *p++ = '\0';
+      p = skip_str_matching (spec_item, comma_or_blank, false);
+      if (*p)
+        {
+          mcel_t g = mcel_scanz (p);
+          *p = '\0';
+          p += g.len;
+        }
       decode_field_spec (spec_item, &file_index, &field_index);
       add_field (file_index, field_index);
     }
-  while (p);
+  while (*p);
 }
 
 /* Set the join field *VAR to VAL, but report an error if *VAR is set
@@ -1087,20 +1106,30 @@ main (int argc, char **argv)
 
         case 't':
           {
-            unsigned char newtab = optarg[0];
-            if (! newtab)
-              newtab = '\n'; /* '' => process the whole line.  */
-            else if (optarg[1])
+            mcel_t newtab;
+            if (!*optarg)
+              {
+                /* '' => process the whole line.  */
+                newtab = mcel_ch ('\n', 1);
+                /* output_separator does not matter.  */
+              }
+            else if (STREQ (optarg, "\\0"))
+              {
+                newtab = mcel_ch ('\0', 1);
+                output_separator = "";
+              }
+            else
               {
-                if (STREQ (optarg, "\\0"))
-                  newtab = '\0';
-                else
+                newtab = mcel_scanz (optarg);
+                if (optarg[newtab.len])
                   error (EXIT_FAILURE, 0, _("multi-character tab %s"),
                          quote (optarg));
+                output_separator = optarg;
               }
-            if (0 <= tab && tab != newtab)
+            if (tab.len && mcel_cmp (tab, newtab) != 0)
               error (EXIT_FAILURE, 0, _("incompatible tabs"));
             tab = newtab;
+            output_seplen = newtab.len;
           }
           break;
 
diff --git a/src/numfmt.c b/src/numfmt.c
index 2ce70226c..7b53c87e4 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -15,6 +15,7 @@
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
 
 #include <config.h>
+#include <ctype.h>
 #include <float.h>
 #include <getopt.h>
 #include <stdckdint.h>
@@ -24,9 +25,9 @@
 
 #include "argmatch.h"
 #include "c-ctype.h"
-#include "cu-ctype.h"
 #include "mbswidth.h"
 #include "quote.h"
+#include "skipchars.h"
 #include "system.h"
 #include "xstrtol.h"
 
@@ -1314,6 +1315,12 @@ process_suffixed_number (char *text, long double *result,
   return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
 }
 
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);  /* See next_field.  */
+}
+
 /* Return a pointer to the beginning of the next field in line.
    The line pointer is moved to the end of the next field. */
 static char*
@@ -1334,11 +1341,8 @@ next_field (char **line)
   else
     {
       /* keep any space prefix in the returned field */
-      while (*field_end && field_sep (*field_end))
-        ++field_end;
-
-      while (*field_end && ! field_sep (*field_end))
-        ++field_end;
+      field_end = skip_str_matching (field_end, newline_or_blank, true);
+      field_end = skip_str_matching (field_end, newline_or_blank, false);
     }
 
   *line = field_end;
diff --git a/src/sort.c b/src/sort.c
index 6856e6151..829b17f42 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -22,6 +22,7 @@
 
 #include <config.h>
 
+#include <ctype.h>
 #include <getopt.h>
 #include <pthread.h>
 #include <sys/resource.h>
@@ -31,7 +32,6 @@
 #include "system.h"
 #include "argmatch.h"
 #include "assure.h"
-#include "cu-ctype.h"
 #include "fadvise.h"
 #include "filevercmp.h"
 #include "flexmember.h"
@@ -1293,9 +1293,9 @@ inittables (void)
 
   for (i = 0; i < UCHAR_LIM; ++i)
     {
-      blanks[i] = field_sep (i);
+      blanks[i] = i == '\n' || isblank (i);
+      nondictionary[i] = ! blanks[i] && ! isalnum (i);
       nonprinting[i] = ! isprint (i);
-      nondictionary[i] = ! isalnum (i) && ! field_sep (i);
       fold_toupper[i] = toupper (i);
     }
 
diff --git a/src/uniq.c b/src/uniq.c
index 7e177ac5a..7dc0c999a 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -23,10 +23,11 @@
 
 #include "system.h"
 #include "argmatch.h"
-#include "cu-ctype.h"
 #include "linebuffer.h"
 #include "fadvise.h"
+#include "mcel.h"
 #include "posixver.h"
+#include "skipchars.h"
 #include "stdio--.h"
 #include "xstrtol.h"
 #include "memcasecmp.h"
@@ -248,6 +249,12 @@ size_opt (char const *opt, char const *msgid)
   return MIN (size, SIZE_MAX);
 }
 
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);  /* See find_field.  */
+}
+
 /* Given a linebuffer LINE,
    return a pointer to the beginning of the line's field to be compared. */
 
@@ -256,21 +263,19 @@ static char *
 find_field (struct linebuffer const *line)
 {
   size_t count;
-  char const *lp = line->buffer;
-  size_t size = line->length - 1;
-  size_t i = 0;
+  char *lp = line->buffer;
+  char const *lim = lp + line->length - 1;
 
-  for (count = 0; count < skip_fields && i < size; count++)
+  for (count = 0; count < skip_fields && lp < lim; count++)
     {
-      while (i < size && field_sep (lp[i]))
-        i++;
-      while (i < size && !field_sep (lp[i]))
-        i++;
+      lp = skip_buf_matching (lp, lim, newline_or_blank, true);
+      lp = skip_buf_matching (lp, lim, newline_or_blank, false);
     }
 
-  i += MIN (skip_chars, size - i);
+  for (size_t s = skip_chars; lp < lim && s; s--)
+    lp += mcel_scan (lp, lim).len;
 
-  return line->buffer + i;
+  return lp;
 }
 
 /* Return false if two strings OLD and NEW match, true if not.
diff --git a/tests/local.mk b/tests/local.mk
index 79fea1f6e..a5fb62d96 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -271,6 +271,7 @@ all_tests =					\
   tests/misc/mktemp.pl				\
   tests/misc/arch.sh				\
   tests/misc/join.pl				\
+  tests/misc/join-utf8.sh			\
   tests/pr/pr-tests.pl				\
   tests/pwd/pwd-option.sh			\
   tests/chcon/chcon-fail.sh			\
diff --git a/tests/misc/join-utf8.sh b/tests/misc/join-utf8.sh
new file mode 100755
index 000000000..b70bff7f9
--- /dev/null
+++ b/tests/misc/join-utf8.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+# Test join -t with 1- to 4-byte separator characters in a UTF-8 locale.
+
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ join
+
+test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+fail=0
+
+vertical_line='|'
+multiplication_sign='×'
+en_dash='–'
+old_Persian_word_divider='𐏐'
+
+for s in \
+    "$vertical_line" \
+    "$multiplication_sign" \
+    "$en_dash" \
+    "$old_Persian_word_divider"
+do
+  printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a ||
+    framework_failure_
+  printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b ||
+    framework_failure_
+  join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
+  printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
+         "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
+    framework_failure
+  compare exp out || fail=1
+done
+
+Exit $fail
-- 
2.39.2

From ba5017b65a45bd73ec156629e3796b6f3c33f95c Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Mon, 30 Oct 2023 01:24:28 -0700
Subject: [PATCH 10/11] maint: copy join, uniq tests from Fedora

* tests/misc/join.pl, tests/uniq/uniq.pl:
Copy from Fedora 39.  This adds more multi-byte tests.
---
 tests/misc/join.pl | 50 +++++++++++++++++++++++++++++++++++++++++
 tests/uniq/uniq.pl | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/tests/misc/join.pl b/tests/misc/join.pl
index 2ca8567ba..1d01a3d3d 100755
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -25,6 +25,15 @@ my $limits = getlimits ();
 
 my $prog = 'join';
 
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+  and $mb_locale = 'C';
+
 my $delim = chr 0247;
 sub t_subst ($)
 {
@@ -333,8 +342,49 @@ foreach my $t (@tv)
     push @Tests, $new_ent;
   }
 
+# If a multi-byte locale is available, also run each test in that
+# locale, under the test name with "-mb" appended.
+if ($mb_locale ne 'C')
+  {
+    # Duplicate each test vector, appending "-mb" to the test name and
+    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+    # provide coverage for the distro-added multi-byte code paths.
+    my @new;
+    foreach my $t (@Tests)
+      {
+        my @new_t = @$t;
+        my $test_name = shift @new_t;
+
+        # Depending on whether join is multi-byte-patched,
+        # it emits different diagnostics:
+        #   non-MB: invalid byte or field list
+        #   MB:     invalid byte, character or field list
+        # Adjust the expected error output accordingly.
+        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+            (@new_t))
+          {
+            my $sub = {ERR_SUBST => 's/, character//'};
+            push @new_t, $sub;
+            push @$t, $sub;
+          }
+        # For tests that expect stderr output, map "NAME-mb" back to "NAME".
+        if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
+             (@new_t))
+          {
+            my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
+            push @new_t, $sub2;
+            push @$t, $sub2;
+          }
+        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+      }
+    push @Tests, @new;
+  }
+
 @Tests = triple_test \@Tests;
 
+# Skip the invalid-j-mb test; it fails because of the format.
+@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
+
 my $save_temps = $ENV{DEBUG};
 my $verbose = $ENV{VERBOSE};
 
diff --git a/tests/uniq/uniq.pl b/tests/uniq/uniq.pl
index a6354dc3c..e43cd6e3f 100755
--- a/tests/uniq/uniq.pl
+++ b/tests/uniq/uniq.pl
@@ -23,9 +23,17 @@ my $limits = getlimits ();
 my $prog = 'uniq';
 my $try = "Try '$prog --help' for more information.\n";
 
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
 # Turn off localization of executable's output.
 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
 
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+  and $mb_locale = 'C';
+
 # When possible, create a "-z"-testing variant of each test.
 sub add_z_variants($)
 {
@@ -262,6 +270,53 @@ foreach my $t (@Tests)
       and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
   }
 
+if ($mb_locale ne 'C')
+  {
+    # Duplicate each test vector, appending "-mb" to the test name and
+    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+    # provide coverage for the distro-added multi-byte code paths.
+    my @new;
+    foreach my $t (@Tests)
+      {
+        my @new_t = @$t;
+        my $test_name = shift @new_t;
+
+        # Depending on whether uniq is multi-byte-patched,
+        # it emits different diagnostics:
+        #   non-MB: invalid byte or field list
+        #   MB:     invalid byte, character or field list
+        # Adjust the expected error output accordingly.
+        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+            (@new_t))
+          {
+            my $sub = {ERR_SUBST => 's/, character//'};
+            push @new_t, $sub;
+            push @$t, $sub;
+          }
+        # In test #145, replace each ‘...’ by '...'.
+        if ($test_name =~ "145")
+          {
+            my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
+            push @new_t, $sub;
+            push @$t, $sub;
+          }
+        next if (   $test_name =~ "schar"
+                 or $test_name =~ "^obs-plus"
+                 or $test_name =~ "119");
+        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+      }
+    push @Tests, @new;
+   }
+
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe.  The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
 @Tests = add_z_variants \@Tests;
 @Tests = triple_test \@Tests;
 
-- 
2.39.2

From bd45f0963c42ee0d9d31b065d9e60e7435ed0523 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Mon, 30 Oct 2023 01:32:37 -0700
Subject: [PATCH 11/11] =?UTF-8?q?maint:=20pacify=20=E2=80=98make=20syntax-?=
 =?UTF-8?q?check=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* tests/misc/join-utf8.sh: Omit fail=0.
Fix framework_failure_ typo.
* tests/misc/join.pl: Change ` to '.
---
 tests/misc/join-utf8.sh | 4 +---
 tests/misc/join.pl      | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/misc/join-utf8.sh b/tests/misc/join-utf8.sh
index b70bff7f9..a2bc3b1e5 100755
--- a/tests/misc/join-utf8.sh
+++ b/tests/misc/join-utf8.sh
@@ -24,8 +24,6 @@ test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
 LC_ALL=$LOCALE_FR_UTF8
 export LC_ALL
 
-fail=0
-
 vertical_line='|'
 multiplication_sign='×'
 en_dash='–'
@@ -44,7 +42,7 @@ do
   join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
   printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
          "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
-    framework_failure
+    framework_failure_
   compare exp out || fail=1
 done
 
diff --git a/tests/misc/join.pl b/tests/misc/join.pl
index 1d01a3d3d..4eafc273d 100755
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -25,7 +25,7 @@ my $limits = getlimits ();
 
 my $prog = 'join';
 
-my $try = "Try \`$prog --help' for more information.\n";
+my $try = "Try '$prog --help' for more information.\n";
 my $inval = "$prog: invalid byte, character or field list\n$try";
 
 my $mb_locale;
-- 
2.39.2

better i18n for join, uniq, etc.

Reply via email to