I installed the attached patches to GNU Coreutils so that join and uniq
support multi-byte characters better out-of-the-box. This uses Gnulib's
new mcel module which makes for simpler multi-byte processing than
what's in Fedora's i18n patches for Coreutils. (I also hope it's faster,
though I haven't tested this.)
The idea is to continue this process of using mcel for the other
programs where vanilla Coreutils doesn't conform to POSIX in multi-byte
locales.
The key patch is 0009. Patch 0010 brings in the Fedora tests for join
and uniq in multi-byte locales; these tests pass for me.
Some work is still needed for ignoring case in join and uniq. As I
understand it, the Fedora patches don't support 'uniq --ignore-case' in
multi-byte locales. They do support 'join --ignore-case', though they
ignore case in the simple-minded way that GNU diff does (except diff
lowercases first whereas Fedora join uppercases first; although neither
approach is perfect, isn't lowercasing better?).
Comments welcome. If the idea isn't a good one we can back out the
patches. But I hope this can move forward.
From 0292a5678a19cb3f3908cf3b267aa1f18b479aac Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 27 Oct 2023 08:45:50 -0700
Subject: [PATCH 01/11] maint: prefer c_isxdigit when that is the intent
* src/digest.c (valid_digits, split_3):
* src/echo.c (main):
* src/printf.c (print_esc):
* src/ptx.c (unescape_string):
* src/stat.c (print_it):
When the code is supposed to support only POSIX-locale hex digits,
use c_isxdigit rather than isxdigit. Include c-ctype.h as needed.
This defends against oddball locales where isxdigit != c_isxdigit.
---
src/digest.c | 5 +++--
src/echo.c | 5 +++--
src/printf.c | 5 +++--
src/ptx.c | 3 ++-
src/stat.c | 5 +++--
5 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/src/digest.c b/src/digest.c
index b996dde11..1f3695308 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -23,6 +23,7 @@
#include "system.h"
#include "argmatch.h"
+#include "c-ctype.h"
#include "quote.h"
#include "xdectoint.h"
#include "xstrtol.h"
@@ -660,7 +661,7 @@ valid_digits (unsigned char const *s, size_t len)
{
for (idx_t i = 0; i < digest_hex_bytes; i++)
{
- if (!isxdigit (*s))
+ if (!c_isxdigit (*s))
return false;
++s;
}
@@ -856,7 +857,7 @@ split_3 (char *s, size_t s_len,
# endif
unsigned char const *hp = *digest;
digest_hex_bytes = 0;
- while (isxdigit (*hp++))
+ while (c_isxdigit (*hp++))
digest_hex_bytes++;
if (digest_hex_bytes < 2 || digest_hex_bytes % 2
|| BLAKE2B_MAX_LEN * 2 < digest_hex_bytes)
diff --git a/src/echo.c b/src/echo.c
index 278778ec6..f80ead86f 100644
--- a/src/echo.c
+++ b/src/echo.c
@@ -19,6 +19,7 @@
#include <sys/types.h>
#include "system.h"
#include "assure.h"
+#include "c-ctype.h"
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "echo"
@@ -219,12 +220,12 @@ just_echo:
case 'x':
{
unsigned char ch = *s;
- if (! isxdigit (ch))
+ if (! c_isxdigit (ch))
goto not_an_escape;
s++;
c = hextobin (ch);
ch = *s;
- if (isxdigit (ch))
+ if (c_isxdigit (ch))
{
s++;
c = c * 16 + hextobin (ch);
diff --git a/src/printf.c b/src/printf.c
index f36b45519..ebe09ba76 100644
--- a/src/printf.c
+++ b/src/printf.c
@@ -56,6 +56,7 @@
#include <wchar.h>
#include "system.h"
+#include "c-ctype.h"
#include "cl-strtod.h"
#include "quote.h"
#include "unicodeio.h"
@@ -262,7 +263,7 @@ print_esc (char const *escstart, bool octal_0)
{
/* A hexadecimal \xhh escape sequence must have 1 or 2 hex. digits. */
for (esc_length = 0, ++p;
- esc_length < 2 && isxdigit (to_uchar (*p));
+ esc_length < 2 && c_isxdigit (to_uchar (*p));
++esc_length, ++p)
esc_value = esc_value * 16 + hextobin (*p);
if (esc_length == 0)
@@ -292,7 +293,7 @@ print_esc (char const *escstart, bool octal_0)
esc_length > 0;
--esc_length, ++p)
{
- if (! isxdigit (to_uchar (*p)))
+ if (! c_isxdigit (to_uchar (*p)))
error (EXIT_FAILURE, 0, _("missing hexadecimal number in escape"));
uni_value = uni_value * 16 + hextobin (*p);
}
diff --git a/src/ptx.c b/src/ptx.c
index 3601875ed..3cd84b2e9 100644
--- a/src/ptx.c
+++ b/src/ptx.c
@@ -24,6 +24,7 @@
#include "system.h"
#include <regex.h>
#include "argmatch.h"
+#include "c-ctype.h"
#include "fadvise.h"
#include "quote.h"
#include "read-file.h"
@@ -308,7 +309,7 @@ unescape_string (char *string)
case 'x': /* \xhhh escape, 3 chars maximum */
value = 0;
for (length = 0, string++;
- length < 3 && isxdigit (to_uchar (*string));
+ length < 3 && c_isxdigit (to_uchar (*string));
length++, string++)
value = value * 16 + HEXTOBIN (*string);
if (length == 0)
diff --git a/src/stat.c b/src/stat.c
index 39acfee70..522e922ed 100644
--- a/src/stat.c
+++ b/src/stat.c
@@ -58,6 +58,7 @@
#include "areadlink.h"
#include "argmatch.h"
+#include "c-ctype.h"
#include "file-type.h"
#include "filemode.h"
#include "fs.h"
@@ -1215,13 +1216,13 @@ print_it (char const *format, int fd, char const *filename,
putchar (esc_value);
--b;
}
- else if (*b == 'x' && isxdigit (to_uchar (b[1])))
+ else if (*b == 'x' && c_isxdigit (to_uchar (b[1])))
{
int esc_value = hextobin (b[1]); /* Value of \xhh escape. */
/* A hexadecimal \xhh escape sequence must have
1 or 2 hex. digits. */
++b;
- if (isxdigit (to_uchar (b[1])))
+ if (c_isxdigit (to_uchar (b[1])))
{
++b;
esc_value = esc_value * 16 + hextobin (*b);
--
2.39.2
From 2f3d9524bb4d803e5adcf91f8cb2f068fe912c44 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 27 Oct 2023 08:56:39 -0700
Subject: [PATCH 02/11] digest: omit unnecessary b2sum includes
* src/blake2/b2sum.c: Do not include string.h, errno.h,
ctype.h, unistd.h, getopt.h.
---
src/blake2/b2sum.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/src/blake2/b2sum.c b/src/blake2/b2sum.c
index 1a7e99f0e..5d69ff8d4 100644
--- a/src/blake2/b2sum.c
+++ b/src/blake2/b2sum.c
@@ -19,12 +19,6 @@
#include <stdio.h>
#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include <ctype.h>
-#include <unistd.h>
-#include <getopt.h>
#include "blake2.h"
@@ -133,6 +127,11 @@ cleanup_buffer:
#if 0
+#include <errno.h>
+#include <getopt.h>
+#include <string.h>
+#include <unistd.h>
+
int blake2sp_stream( FILE *stream, void *resstream, size_t outbytes )
{
int ret = -1;
--
2.39.2
From 684e810ae2de35dd2761bc28149280a249810d5b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 27 Oct 2023 17:15:08 -0700
Subject: [PATCH 03/11] maint: move field_sep into separate module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This is so that we don’t need to have every source file
include ctype.h.
* bootstrap.conf (gnulib_modules): Add cu-ctype.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
New files.
* src/join.c, src/numfmt.c, src/sort.c, src/uniq.c:
Include cu-ctype.h, for field_sep.
* src/system.h (field_sep): Remove; now supplied by cu-ctype.
---
bootstrap.conf | 1 +
gl/lib/cu-ctype.c | 3 +++
gl/lib/cu-ctype.h | 35 +++++++++++++++++++++++++++++++++++
gl/modules/cu-ctype | 24 ++++++++++++++++++++++++
src/join.c | 1 +
src/numfmt.c | 1 +
src/sort.c | 1 +
src/system.h | 7 -------
src/uniq.c | 1 +
9 files changed, 67 insertions(+), 7 deletions(-)
create mode 100644 gl/lib/cu-ctype.c
create mode 100644 gl/lib/cu-ctype.h
create mode 100644 gl/modules/cu-ctype
diff --git a/bootstrap.conf b/bootstrap.conf
index db0c90c67..4724544d7 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -70,6 +70,7 @@ gnulib_modules="
crypto/sha256
crypto/sha512
crypto/sm3
+ cu-ctype
cycle-check
d-ino
d-type
diff --git a/gl/lib/cu-ctype.c b/gl/lib/cu-ctype.c
new file mode 100644
index 000000000..9f753de2e
--- /dev/null
+++ b/gl/lib/cu-ctype.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
+#include <cu-ctype.h>
diff --git a/gl/lib/cu-ctype.h b/gl/lib/cu-ctype.h
new file mode 100644
index 000000000..82f1d73f2
--- /dev/null
+++ b/gl/lib/cu-ctype.h
@@ -0,0 +1,35 @@
+/* Character type definitions for coreutils
+
+ Copyright 2023 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <ctype.h>
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+# error "Please include config.h first."
+#endif
+_GL_INLINE_HEADER_BEGIN
+#ifndef CU_CTYPE_INLINE
+# define CU_CTYPE_INLINE _GL_INLINE
+#endif
+
+/* '\n' is considered a field separator with --zero-terminated. */
+CU_CTYPE_INLINE bool
+field_sep (unsigned char ch)
+{
+ return isblank (ch) || ch == '\n';
+}
+
+_GL_INLINE_HEADER_END
diff --git a/gl/modules/cu-ctype b/gl/modules/cu-ctype
new file mode 100644
index 000000000..bd328b32e
--- /dev/null
+++ b/gl/modules/cu-ctype
@@ -0,0 +1,24 @@
+Description:
+ctype.h-like definitions for coreutils
+
+Files:
+lib/cu-ctype.c
+lib/cu-ctype.h
+
+Depends-on:
+ctype
+extern-inline
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += cu-ctype.c
+
+Include:
+"cu-ctype.h"
+
+License:
+GPL
+
+Maintainer:
+all
diff --git a/src/join.c b/src/join.c
index 7eef58c0b..b95cf2b9b 100644
--- a/src/join.c
+++ b/src/join.c
@@ -23,6 +23,7 @@
#include "system.h"
#include "assure.h"
+#include "cu-ctype.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "linebuffer.h"
diff --git a/src/numfmt.c b/src/numfmt.c
index 8fd6e77ad..2ce70226c 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -24,6 +24,7 @@
#include "argmatch.h"
#include "c-ctype.h"
+#include "cu-ctype.h"
#include "mbswidth.h"
#include "quote.h"
#include "system.h"
diff --git a/src/sort.c b/src/sort.c
index 5c86b8332..6856e6151 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -31,6 +31,7 @@
#include "system.h"
#include "argmatch.h"
#include "assure.h"
+#include "cu-ctype.h"
#include "fadvise.h"
#include "filevercmp.h"
#include "flexmember.h"
diff --git a/src/system.h b/src/system.h
index 21b15839b..b4e0a7275 100644
--- a/src/system.h
+++ b/src/system.h
@@ -158,13 +158,6 @@ enum
errors that the cast doesn't. */
static inline unsigned char to_uchar (char ch) { return ch; }
-/* '\n' is considered a field separator with --zero-terminated. */
-static inline bool
-field_sep (unsigned char ch)
-{
- return isblank (ch) || ch == '\n';
-}
-
#include <locale.h>
/* Take care of NLS matters. */
diff --git a/src/uniq.c b/src/uniq.c
index d294ed665..7e177ac5a 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -23,6 +23,7 @@
#include "system.h"
#include "argmatch.h"
+#include "cu-ctype.h"
#include "linebuffer.h"
#include "fadvise.h"
#include "posixver.h"
--
2.39.2
From 4edb14d20f972595fd08f841b94f7454752e2b5f Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 27 Oct 2023 17:31:49 -0700
Subject: [PATCH 04/11] maint: include ctype.h selectively
Include ctype.h only in files that need it. Many of its uses
are incorrect, as they assume single-byte locales. The idea is
to remove the incorrect uses later, when there is time.
* src/chroot.c, src/csplit.c, src/dd.c, src/digest.c, src/dircolors.c:
* src/expand-common.c, src/expand.c, src/fmt.c, src/fold.c, src/ls.c:
* src/od.c, src/pinky.c, src/pr.c, src/ptx.c, src/seq.c:
* src/set-fields.c, src/split.c, src/stdbuf.c, src/test.c:
* src/tr.c, src/truncate.c, src/unexpand.c, src/wc.c:
Include ctype.h.
* src/system.h: Do not include ctype.h.
---
src/chroot.c | 1 +
src/csplit.c | 1 +
src/dd.c | 1 +
src/digest.c | 1 +
src/dircolors.c | 1 +
src/expand-common.c | 1 +
src/expand.c | 1 +
src/fmt.c | 1 +
src/fold.c | 1 +
src/ls.c | 1 +
src/od.c | 1 +
src/pinky.c | 1 +
src/pr.c | 1 +
src/ptx.c | 1 +
src/seq.c | 1 +
src/set-fields.c | 1 +
src/split.c | 1 +
src/stdbuf.c | 1 +
src/system.h | 2 --
src/test.c | 1 +
src/tr.c | 1 +
src/truncate.c | 1 +
src/unexpand.c | 1 +
src/wc.c | 1 +
24 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/src/chroot.c b/src/chroot.c
index 6150af5cd..17af5ebe4 100644
--- a/src/chroot.c
+++ b/src/chroot.c
@@ -17,6 +17,7 @@
/* Written by Roland McGrath. */
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <stdio.h>
#include <sys/types.h>
diff --git a/src/csplit.c b/src/csplit.c
index dca525aaf..32fb96bca 100644
--- a/src/csplit.c
+++ b/src/csplit.c
@@ -19,6 +19,7 @@
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <sys/types.h>
#include <signal.h>
diff --git a/src/dd.c b/src/dd.c
index 595b8755b..85ea26a3f 100644
--- a/src/dd.c
+++ b/src/dd.c
@@ -18,6 +18,7 @@
#include <config.h>
+#include <ctype.h>
#include <sys/types.h>
#include <signal.h>
#include <stdckdint.h>
diff --git a/src/digest.c b/src/digest.c
index 1f3695308..336392608 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -18,6 +18,7 @@
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <sys/types.h>
diff --git a/src/dircolors.c b/src/dircolors.c
index 8a86efb76..f9001de07 100644
--- a/src/dircolors.c
+++ b/src/dircolors.c
@@ -17,6 +17,7 @@
#include <config.h>
+#include <ctype.h>
#include <sys/types.h>
#include <fnmatch.h>
#include <getopt.h>
diff --git a/src/expand-common.c b/src/expand-common.c
index 89fa56ace..16240802d 100644
--- a/src/expand-common.c
+++ b/src/expand-common.c
@@ -16,6 +16,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
#include "system.h"
diff --git a/src/expand.c b/src/expand.c
index 0e74d0cf6..00f2119c6 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -34,6 +34,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
diff --git a/src/fmt.c b/src/fmt.c
index ad7a9ce56..b6fe74630 100644
--- a/src/fmt.c
+++ b/src/fmt.c
@@ -17,6 +17,7 @@
/* Written by Ross Paterson <r...@doc.ic.ac.uk>. */
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
#include <getopt.h>
diff --git a/src/fold.c b/src/fold.c
index 5c0428d80..1a3859097 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -18,6 +18,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
diff --git a/src/ls.c b/src/ls.c
index 769ae85a7..e16972d3e 100644
--- a/src/ls.c
+++ b/src/ls.c
@@ -36,6 +36,7 @@
Greg Lee <l...@uhunix.uhcc.hawaii.edu>. */
#include <config.h>
+#include <ctype.h>
#include <sys/types.h>
#include <termios.h>
diff --git a/src/od.c b/src/od.c
index 538175af7..951e88652 100644
--- a/src/od.c
+++ b/src/od.c
@@ -18,6 +18,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdckdint.h>
#include <stdio.h>
#include <getopt.h>
diff --git a/src/pinky.c b/src/pinky.c
index db0d2557b..8c872b2fe 100644
--- a/src/pinky.c
+++ b/src/pinky.c
@@ -17,6 +17,7 @@
/* Created by hacking who.c by Kaveh Ghazi gh...@caip.rutgers.edu */
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <pwd.h>
#include <stdckdint.h>
diff --git a/src/pr.c b/src/pr.c
index 57361d629..7e680e23c 100644
--- a/src/pr.c
+++ b/src/pr.c
@@ -309,6 +309,7 @@
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <stdckdint.h>
#include <sys/types.h>
diff --git a/src/ptx.c b/src/ptx.c
index 3cd84b2e9..c1524fed7 100644
--- a/src/ptx.c
+++ b/src/ptx.c
@@ -19,6 +19,7 @@
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <sys/types.h>
#include "system.h"
diff --git a/src/seq.c b/src/seq.c
index 2822d5c2c..96d14be1c 100644
--- a/src/seq.c
+++ b/src/seq.c
@@ -17,6 +17,7 @@
/* Written by Ulrich Drepper. */
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <stdio.h>
#include <sys/types.h>
diff --git a/src/set-fields.c b/src/set-fields.c
index b299280c0..a524b7fa4 100644
--- a/src/set-fields.c
+++ b/src/set-fields.c
@@ -19,6 +19,7 @@
#include <config.h>
#include "system.h"
+#include <ctype.h>
#include "quote.h"
#include "set-fields.h"
diff --git a/src/split.c b/src/split.c
index d2cd23234..f56a144a6 100644
--- a/src/split.c
+++ b/src/split.c
@@ -21,6 +21,7 @@
* support --suppress-matched as in csplit. */
#include <config.h>
+#include <ctype.h>
#include <stdckdint.h>
#include <stdio.h>
#include <getopt.h>
diff --git a/src/stdbuf.c b/src/stdbuf.c
index 1ec23cf8b..51326ad4e 100644
--- a/src/stdbuf.c
+++ b/src/stdbuf.c
@@ -17,6 +17,7 @@
/* Written by Pádraig Brady. */
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
diff --git a/src/system.h b/src/system.h
index b4e0a7275..8c2a4fd8b 100644
--- a/src/system.h
+++ b/src/system.h
@@ -142,8 +142,6 @@ enum
#include "timespec.h"
-#include <ctype.h>
-
/* ISDIGIT differs from isdigit, as follows:
- Its arg may be any int or unsigned int; it need not be an unsigned char
or EOF.
diff --git a/src/test.c b/src/test.c
index a4eb40a52..2bcb9abc8 100644
--- a/src/test.c
+++ b/src/test.c
@@ -27,6 +27,7 @@
#endif
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
diff --git a/src/tr.c b/src/tr.c
index 625c27583..292aae1d4 100644
--- a/src/tr.c
+++ b/src/tr.c
@@ -18,6 +18,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
#include <getopt.h>
diff --git a/src/truncate.c b/src/truncate.c
index 4a828e1da..040172c4d 100644
--- a/src/truncate.c
+++ b/src/truncate.c
@@ -21,6 +21,7 @@
to better fit the "GNU" environment. */
#include <config.h> /* sets _FILE_OFFSET_BITS=64 etc. */
+#include <ctype.h>
#include <stdckdint.h>
#include <stdio.h>
#include <getopt.h>
diff --git a/src/unexpand.c b/src/unexpand.c
index 5a2283fdd..46e943365 100644
--- a/src/unexpand.c
+++ b/src/unexpand.c
@@ -35,6 +35,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
diff --git a/src/wc.c b/src/wc.c
index e69ad0d51..43170cf9b 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -19,6 +19,7 @@
#include <config.h>
+#include <ctype.h>
#include <stdckdint.h>
#include <stdio.h>
#include <getopt.h>
--
2.39.2
From 5602342a16e81be25ec00b12af847fc0c72f6589 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 28 Oct 2023 09:07:14 -0700
Subject: [PATCH 05/11] maint: port to oddball tolower
* src/digest.c (hex_equal): Work even in oddball locales
where tolower does not work as expected on ASCII letters.
---
src/digest.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/src/digest.c b/src/digest.c
index 336392608..052fa4db7 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -18,7 +18,6 @@
#include <config.h>
-#include <ctype.h>
#include <getopt.h>
#include <sys/types.h>
@@ -1122,9 +1121,9 @@ hex_equal (unsigned char const *hex_digest, unsigned char const *bin_buffer)
size_t cnt;
for (cnt = 0; cnt < digest_bin_bytes; ++cnt)
{
- if (tolower (hex_digest[2 * cnt])
+ if (c_tolower (hex_digest[2 * cnt])
!= bin2hex[bin_buffer[cnt] >> 4]
- || (tolower (hex_digest[2 * cnt + 1])
+ || (c_tolower (hex_digest[2 * cnt + 1])
!= (bin2hex[bin_buffer[cnt] & 0xf])))
break;
}
--
2.39.2
From 8d60cd8ad69a0c0cd0dcd86e774157bddb41cb79 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 28 Oct 2023 09:22:09 -0700
Subject: [PATCH 06/11] dircolors: assume C-locale spaces
* src/dircolors.c: Include c-ctype.h, not ctype.h.
(parse_line): Use c_isspace, not isspace, as the .dircolors
file format (which does not seem to be documented!) appears
to be ASCII.
---
src/dircolors.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/dircolors.c b/src/dircolors.c
index f9001de07..75ea51603 100644
--- a/src/dircolors.c
+++ b/src/dircolors.c
@@ -17,13 +17,13 @@
#include <config.h>
-#include <ctype.h>
#include <sys/types.h>
#include <fnmatch.h>
#include <getopt.h>
#include "system.h"
#include "dircolors.h"
+#include "c-ctype.h"
#include "c-strcase.h"
#include "obstack.h"
#include "quote.h"
@@ -153,7 +153,7 @@ parse_line (char const *line, char **keyword, char **arg)
*keyword = nullptr;
*arg = nullptr;
- for (p = line; isspace (to_uchar (*p)); ++p)
+ for (p = line; c_isspace (to_uchar (*p)); ++p)
continue;
/* Ignore blank lines and shell-style comments. */
@@ -162,7 +162,7 @@ parse_line (char const *line, char **keyword, char **arg)
keyword_start = p;
- while (!isspace (to_uchar (*p)) && *p != '\0')
+ while (!c_isspace (to_uchar (*p)) && *p != '\0')
{
++p;
}
@@ -175,7 +175,7 @@ parse_line (char const *line, char **keyword, char **arg)
{
++p;
}
- while (isspace (to_uchar (*p)));
+ while (c_isspace (to_uchar (*p)));
if (*p == '\0' || *p == '#')
return;
@@ -185,7 +185,7 @@ parse_line (char const *line, char **keyword, char **arg)
while (*p != '\0' && *p != '#')
++p;
- for (--p; isspace (to_uchar (*p)); --p)
+ for (--p; c_isspace (to_uchar (*p)); --p)
continue;
++p;
--
2.39.2
From a3ce33c106c3db936deb3bfa9784d6e53f921233 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 28 Oct 2023 09:30:49 -0700
Subject: [PATCH 07/11] stdbuf: port to oddball toupper
* src/stdbuf.c: Do not include ctype.h.
(set_libstdbuf_options): Use c_toupper, not toupper,
since the C locale is intended here.
---
src/stdbuf.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/src/stdbuf.c b/src/stdbuf.c
index 51326ad4e..65142fd8c 100644
--- a/src/stdbuf.c
+++ b/src/stdbuf.c
@@ -17,7 +17,6 @@
/* Written by Pádraig Brady. */
#include <config.h>
-#include <ctype.h>
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
@@ -286,10 +285,10 @@ set_libstdbuf_options (void)
if (*stdbuf[i].optarg == 'L')
ret = asprintf (&var, "%s%c=L", "_STDBUF_",
- toupper (stdbuf[i].optc));
+ c_toupper (stdbuf[i].optc));
else
ret = asprintf (&var, "%s%c=%zu", "_STDBUF_",
- toupper (stdbuf[i].optc),
+ c_toupper (stdbuf[i].optc),
stdbuf[i].size);
if (ret < 0)
xalloc_die ();
--
2.39.2
From 2709bea0f440507ac009e6e7ded453bb792d6842 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 28 Oct 2023 16:15:49 -0700
Subject: [PATCH 08/11] test: allow non-blank white space in numbers
* src/test.c (find_int): Use isspace, not isblank,
for compatibility with how strtol works, which
is how most other shells do this.
---
src/test.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/test.c b/src/test.c
index 2bcb9abc8..4f14e4080 100644
--- a/src/test.c
+++ b/src/test.c
@@ -136,7 +136,7 @@ find_int (char const *string)
char const *p;
char const *number_start;
- for (p = string; isblank (to_uchar (*p)); p++)
+ for (p = string; isspace (to_uchar (*p)); p++)
continue;
if (*p == '+')
@@ -154,7 +154,7 @@ find_int (char const *string)
{
while (ISDIGIT (*p))
p++;
- while (isblank (to_uchar (*p)))
+ while (isspace (to_uchar (*p)))
p++;
if (!*p)
return number_start;
--
2.39.2
From 11b01fc21f1dff2685477c03596a0a4009aec7da Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 30 Oct 2023 00:32:51 -0700
Subject: [PATCH 09/11] join,uniq: support multi-byte separators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* NEWS: Mention this.
* bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module
is now more trouble than it’s worth. All uses removed.
Add skipchars.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
Remove.
* gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars:
* tests/misc/join-utf8.sh:
New files.
* src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h.
(tab): Now mcel_t, not int. All uses changed.
(output_separator, output_seplen): New static vars.
(eq_tab, newline_or_blank, comma_or_blank): New functions.
(xfields, prfields, prjoin, add_field_list, main):
Support multi-byte characters.
* src/numfmt.c: Include ctype.h, skipchars.h.
Do not include cu-ctype.h.
(newline_or_blank): New function.
(next_field): Support multi-byte characters.
* src/sort.c: Include ctype.h instead of cu-ctype.h.
(inittables): Open-code field_sep since it no longer exists.
‘sort’ is not multi-byte safe yet, but when it is this code
will need revamping anyway.
* src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h.
(newline_or_blank): New function.
(find_field): Support multi-byte characters.
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh
---
NEWS | 5 ++
bootstrap.conf | 2 +-
gl/lib/cu-ctype.c | 3 -
gl/lib/cu-ctype.h | 35 ------------
gl/lib/skipchars.c | 3 +
gl/lib/skipchars.h | 56 +++++++++++++++++++
gl/modules/cu-ctype | 24 --------
gl/modules/skipchars | 24 ++++++++
src/join.c | 119 +++++++++++++++++++++++++---------------
src/numfmt.c | 16 ++++--
src/sort.c | 6 +-
src/uniq.c | 27 +++++----
tests/local.mk | 1 +
tests/misc/join-utf8.sh | 51 +++++++++++++++++
14 files changed, 244 insertions(+), 128 deletions(-)
delete mode 100644 gl/lib/cu-ctype.c
delete mode 100644 gl/lib/cu-ctype.h
create mode 100644 gl/lib/skipchars.c
create mode 100644 gl/lib/skipchars.h
delete mode 100644 gl/modules/cu-ctype
create mode 100644 gl/modules/skipchars
create mode 100755 tests/misc/join-utf8.sh
diff --git a/NEWS b/NEWS
index 3021211dc..b1088f683 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,11 @@ GNU coreutils NEWS -*- outline -*-
to preserve ownership" when copying to GNU/Linux CIFS file systems.
They do this by working around some Linux CIFS bugs.
+ join and uniq now support multi-byte characters better.
+ For example, 'join -tX' now works even if X is a multi-byte character,
+ and both programs now treat multi-byte characters like U+3000
+ IDEOGRAPHIC SPACE as blanks if the current locale treats them so.
+
numfmt options like --suffix no longer have an arbitrary 127-byte limit.
[bug introduced with numfmt in coreutils-8.21]
diff --git a/bootstrap.conf b/bootstrap.conf
index 4724544d7..97645d6f0 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -70,7 +70,6 @@ gnulib_modules="
crypto/sha256
crypto/sha512
crypto/sm3
- cu-ctype
cycle-check
d-ino
d-type
@@ -241,6 +240,7 @@ gnulib_modules="
settime
sig2str
sigaction
+ skipchars
smack
ssize_t
stat-macros
diff --git a/gl/lib/cu-ctype.c b/gl/lib/cu-ctype.c
deleted file mode 100644
index 9f753de2e..000000000
--- a/gl/lib/cu-ctype.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <config.h>
-#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
-#include <cu-ctype.h>
diff --git a/gl/lib/cu-ctype.h b/gl/lib/cu-ctype.h
deleted file mode 100644
index 82f1d73f2..000000000
--- a/gl/lib/cu-ctype.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Character type definitions for coreutils
-
- Copyright 2023 Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
-
-#include <ctype.h>
-
-#ifndef _GL_INLINE_HEADER_BEGIN
-# error "Please include config.h first."
-#endif
-_GL_INLINE_HEADER_BEGIN
-#ifndef CU_CTYPE_INLINE
-# define CU_CTYPE_INLINE _GL_INLINE
-#endif
-
-/* '\n' is considered a field separator with --zero-terminated. */
-CU_CTYPE_INLINE bool
-field_sep (unsigned char ch)
-{
- return isblank (ch) || ch == '\n';
-}
-
-_GL_INLINE_HEADER_END
diff --git a/gl/lib/skipchars.c b/gl/lib/skipchars.c
new file mode 100644
index 000000000..827c89d45
--- /dev/null
+++ b/gl/lib/skipchars.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
+#include <skipchars.h>
diff --git a/gl/lib/skipchars.h b/gl/lib/skipchars.h
new file mode 100644
index 000000000..baa9eaba6
--- /dev/null
+++ b/gl/lib/skipchars.h
@@ -0,0 +1,56 @@
+/* Skipping sequences of characters satisfying a predicate
+
+ Copyright 2023 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include "mcel.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef SKIPCHARS_INLINE
+# define SKIPCHARS_INLINE _GL_INLINE
+#endif
+
+/* Return the address just past the leading sequence of possibly
+ multi-byte characters or encoding errors G in STR that satisfy
+ PREDICATE (G) if OK is true, or that do not satisfy the predicate
+ call if OK is false. */
+
+SKIPCHARS_INLINE char *
+skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
+{
+ char const *s = str;
+ for (mcel_t g; *s && predicate (g = mcel_scanz (s)) == ok;
+ s += g.len)
+ continue;
+ return (char *) s;
+}
+
+/* Return the address just past the leading sequence of possibly
+ multi-byte characters or encoding errors G in BUF (which ends at LIM)
+ that satisfy PREDICATE (G) if OK is true, or that do not satisfy
+ the predicate call if OK is false. */
+
+SKIPCHARS_INLINE char *
+skip_buf_matching (char const *buf, char const *lim,
+ bool (*predicate) (mcel_t), bool ok)
+{
+ char const *s = buf;
+ for (mcel_t g; s < lim && predicate (g = mcel_scan (s, lim)) == ok;
+ s += g.len)
+ continue;
+ return (char *) s;
+}
+
+_GL_INLINE_HEADER_END
diff --git a/gl/modules/cu-ctype b/gl/modules/cu-ctype
deleted file mode 100644
index bd328b32e..000000000
--- a/gl/modules/cu-ctype
+++ /dev/null
@@ -1,24 +0,0 @@
-Description:
-ctype.h-like definitions for coreutils
-
-Files:
-lib/cu-ctype.c
-lib/cu-ctype.h
-
-Depends-on:
-ctype
-extern-inline
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += cu-ctype.c
-
-Include:
-"cu-ctype.h"
-
-License:
-GPL
-
-Maintainer:
-all
diff --git a/gl/modules/skipchars b/gl/modules/skipchars
new file mode 100644
index 000000000..3b25fd6eb
--- /dev/null
+++ b/gl/modules/skipchars
@@ -0,0 +1,24 @@
+Description:
+Skip sequences of multi-byte characters or encoding errors
+
+Files:
+lib/skipchars.c
+lib/skipchars.h
+
+Depends-on:
+extern-inline
+mcel
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += skipchars.c
+
+Include:
+"skipchars.h"
+
+License:
+GPL
+
+Maintainer:
+all
diff --git a/src/join.c b/src/join.c
index b95cf2b9b..b3ad27465 100644
--- a/src/join.c
+++ b/src/join.c
@@ -23,12 +23,13 @@
#include "system.h"
#include "assure.h"
-#include "cu-ctype.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "linebuffer.h"
+#include "mcel.h"
#include "memcasecmp.h"
#include "quote.h"
+#include "skipchars.h"
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
@@ -135,10 +136,14 @@ static struct outlist outlist_head;
/* Last element in 'outlist', where a new element can be added. */
static struct outlist *outlist_end = &outlist_head;
-/* Tab character separating fields. If negative, fields are separated
- by any nonempty string of blanks, otherwise by exactly one
- tab character whose value (when cast to unsigned char) equals TAB. */
-static int tab = -1;
+/* Tab character (or encoding error) separating fields. If TAB.len == 0,
+ fields are separated by any nonempty string of blanks, otherwise by
+ exactly one tab character (or encoding error) equal to TAB. */
+static mcel_t tab;
+
+/* The output separator to use, and its length in bytes. */
+static char const *output_separator = " ";
+static idx_t output_seplen = 1;
/* If nonzero, check that the input is correctly ordered. */
static enum
@@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len)
++(line->nfields);
}
+static bool
+eq_tab (mcel_t g)
+{
+ return mcel_cmp (g, tab) == 0;
+}
+
+static bool
+newline_or_blank (mcel_t g)
+{
+ return g.ch == '\n' || c32isblank (g.ch);
+}
+
/* Fill in the 'fields' structure in LINE. */
static void
@@ -278,34 +295,29 @@ xfields (struct line *line)
if (ptr == lim)
return;
- if (0 <= tab && tab != '\n')
- {
- char *sep;
- for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
- extract_field (line, ptr, sep - ptr);
- }
- else if (tab < 0)
+ if (!tab.len)
{
- /* Skip leading blanks before the first field. */
- while (field_sep (*ptr))
- if (++ptr == lim)
- return;
-
- do
+ while (ptr < lim)
{
- char *sep;
- for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
- continue;
+ ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
+ if (!*ptr)
+ break;
+ char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
extract_field (line, ptr, sep - ptr);
- if (sep == lim)
- return;
- for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
- continue;
+ ptr = sep;
}
- while (ptr != lim);
}
+ else
+ {
+ if (tab.ch != '\n')
+ for (char *sep;
+ ((sep = skip_buf_matching (ptr, lim, eq_tab, false))
+ < lim);
+ ptr = sep + mcel_scan (sep, lim).len)
+ extract_field (line, ptr, sep - ptr);
- extract_field (line, ptr, lim - ptr);
+ extract_field (line, ptr, lim - ptr);
+ }
}
static void
@@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount)
{
idx_t i;
idx_t nfields = autoformat ? autocount : line->nfields;
- char output_separator = tab < 0 ? ' ' : tab;
for (i = 0; i < join_field && i < nfields; ++i)
{
- putchar (output_separator);
+ fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
for (i = join_field + 1; i < nfields; ++i)
{
- putchar (output_separator);
+ fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
}
@@ -588,7 +599,6 @@ static void
prjoin (struct line const *line1, struct line const *line2)
{
const struct outlist *outlist;
- char output_separator = tab < 0 ? ' ' : tab;
idx_t field;
struct line const *line;
@@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2)
o = o->next;
if (o == nullptr)
break;
- putchar (output_separator);
+ fwrite (output_separator, 1, output_seplen, stdout);
}
putchar (eolchar);
}
@@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
}
}
+static bool
+comma_or_blank (mcel_t g)
+{
+ return g.ch == ',' || c32isblank (g.ch);
+}
+
/* Add the comma or blank separated field spec(s) in STR to 'outlist'. */
static void
@@ -898,14 +914,17 @@ add_field_list (char *str)
int file_index;
idx_t field_index;
char const *spec_item = p;
-
- p = strpbrk (p, ", \t");
- if (p)
- *p++ = '\0';
+ p = skip_str_matching (spec_item, comma_or_blank, false);
+ if (*p)
+ {
+ mcel_t g = mcel_scanz (p);
+ *p = '\0';
+ p += g.len;
+ }
decode_field_spec (spec_item, &file_index, &field_index);
add_field (file_index, field_index);
}
- while (p);
+ while (*p);
}
/* Set the join field *VAR to VAL, but report an error if *VAR is set
@@ -1087,20 +1106,30 @@ main (int argc, char **argv)
case 't':
{
- unsigned char newtab = optarg[0];
- if (! newtab)
- newtab = '\n'; /* '' => process the whole line. */
- else if (optarg[1])
+ mcel_t newtab;
+ if (!*optarg)
+ {
+ /* '' => process the whole line. */
+ newtab = mcel_ch ('\n', 1);
+ /* output_separator does not matter. */
+ }
+ else if (STREQ (optarg, "\\0"))
+ {
+ newtab = mcel_ch ('\0', 1);
+ output_separator = "";
+ }
+ else
{
- if (STREQ (optarg, "\\0"))
- newtab = '\0';
- else
+ newtab = mcel_scanz (optarg);
+ if (optarg[newtab.len])
error (EXIT_FAILURE, 0, _("multi-character tab %s"),
quote (optarg));
+ output_separator = optarg;
}
- if (0 <= tab && tab != newtab)
+ if (tab.len && mcel_cmp (tab, newtab) != 0)
error (EXIT_FAILURE, 0, _("incompatible tabs"));
tab = newtab;
+ output_seplen = newtab.len;
}
break;
diff --git a/src/numfmt.c b/src/numfmt.c
index 2ce70226c..7b53c87e4 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -15,6 +15,7 @@
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
+#include <ctype.h>
#include <float.h>
#include <getopt.h>
#include <stdckdint.h>
@@ -24,9 +25,9 @@
#include "argmatch.h"
#include "c-ctype.h"
-#include "cu-ctype.h"
#include "mbswidth.h"
#include "quote.h"
+#include "skipchars.h"
#include "system.h"
#include "xstrtol.h"
@@ -1314,6 +1315,12 @@ process_suffixed_number (char *text, long double *result,
return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
}
+static bool
+newline_or_blank (mcel_t g)
+{
+ return g.ch == '\n' || c32isblank (g.ch);
+}
+
/* Return a pointer to the beginning of the next field in line.
The line pointer is moved to the end of the next field. */
static char*
@@ -1334,11 +1341,8 @@ next_field (char **line)
else
{
/* keep any space prefix in the returned field */
- while (*field_end && field_sep (*field_end))
- ++field_end;
-
- while (*field_end && ! field_sep (*field_end))
- ++field_end;
+ field_end = skip_str_matching (field_end, newline_or_blank, true);
+ field_end = skip_str_matching (field_end, newline_or_blank, false);
}
*line = field_end;
diff --git a/src/sort.c b/src/sort.c
index 6856e6151..829b17f42 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -22,6 +22,7 @@
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <pthread.h>
#include <sys/resource.h>
@@ -31,7 +32,6 @@
#include "system.h"
#include "argmatch.h"
#include "assure.h"
-#include "cu-ctype.h"
#include "fadvise.h"
#include "filevercmp.h"
#include "flexmember.h"
@@ -1293,9 +1293,9 @@ inittables (void)
for (i = 0; i < UCHAR_LIM; ++i)
{
- blanks[i] = field_sep (i);
+ blanks[i] = i == '\n' || isblank (i);
+ nondictionary[i] = ! blanks[i] && ! isalnum (i);
nonprinting[i] = ! isprint (i);
- nondictionary[i] = ! isalnum (i) && ! field_sep (i);
fold_toupper[i] = toupper (i);
}
diff --git a/src/uniq.c b/src/uniq.c
index 7e177ac5a..7dc0c999a 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -23,10 +23,11 @@
#include "system.h"
#include "argmatch.h"
-#include "cu-ctype.h"
#include "linebuffer.h"
#include "fadvise.h"
+#include "mcel.h"
#include "posixver.h"
+#include "skipchars.h"
#include "stdio--.h"
#include "xstrtol.h"
#include "memcasecmp.h"
@@ -248,6 +249,12 @@ size_opt (char const *opt, char const *msgid)
return MIN (size, SIZE_MAX);
}
+static bool
+newline_or_blank (mcel_t g)
+{
+ return g.ch == '\n' || c32isblank (g.ch);
+}
+
/* Given a linebuffer LINE,
return a pointer to the beginning of the line's field to be compared. */
@@ -256,21 +263,19 @@ static char *
find_field (struct linebuffer const *line)
{
size_t count;
- char const *lp = line->buffer;
- size_t size = line->length - 1;
- size_t i = 0;
+ char *lp = line->buffer;
+ char const *lim = lp + line->length - 1;
- for (count = 0; count < skip_fields && i < size; count++)
+ for (count = 0; count < skip_fields && lp < lim; count++)
{
- while (i < size && field_sep (lp[i]))
- i++;
- while (i < size && !field_sep (lp[i]))
- i++;
+ lp = skip_buf_matching (lp, lim, newline_or_blank, true);
+ lp = skip_buf_matching (lp, lim, newline_or_blank, false);
}
- i += MIN (skip_chars, size - i);
+ for (size_t s = skip_chars; lp < lim && s; s--)
+ lp += mcel_scan (lp, lim).len;
- return line->buffer + i;
+ return lp;
}
/* Return false if two strings OLD and NEW match, true if not.
diff --git a/tests/local.mk b/tests/local.mk
index 79fea1f6e..a5fb62d96 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -271,6 +271,7 @@ all_tests = \
tests/misc/mktemp.pl \
tests/misc/arch.sh \
tests/misc/join.pl \
+ tests/misc/join-utf8.sh \
tests/pr/pr-tests.pl \
tests/pwd/pwd-option.sh \
tests/chcon/chcon-fail.sh \
diff --git a/tests/misc/join-utf8.sh b/tests/misc/join-utf8.sh
new file mode 100755
index 000000000..b70bff7f9
--- /dev/null
+++ b/tests/misc/join-utf8.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+# Test join in a UTF-8 locale.
+
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ join
+
+test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+fail=0
+
+vertical_line='|'
+multiplication_sign='×'
+en_dash='–'
+old_Persian_word_divider='𐏐'
+
+for s in \
+ "$vertical_line" \
+ "$multiplication_sign" \
+ "$en_dash" \
+ "$old_Persian_word_divider"
+do
+ printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a ||
+ framework_failure_
+ printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b ||
+ framework_failure_
+ join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
+ printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
+ "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
+ framework_failure
+ compare exp out || fail=1
+done
+
+Exit $fail
--
2.39.2
From ba5017b65a45bd73ec156629e3796b6f3c33f95c Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 30 Oct 2023 01:24:28 -0700
Subject: [PATCH 10/11] maint: copy join, uniq tests from Fedora
* tests/misc/join.pl, tests/uniq/uniq.pl:
Copy from Fedora 39. This adds more multi-byte tests.
---
tests/misc/join.pl | 50 +++++++++++++++++++++++++++++++++++++++++
tests/uniq/uniq.pl | 55 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 105 insertions(+)
diff --git a/tests/misc/join.pl b/tests/misc/join.pl
index 2ca8567ba..1d01a3d3d 100755
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -25,6 +25,15 @@ my $limits = getlimits ();
my $prog = 'join';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
my $delim = chr 0247;
sub t_subst ($)
{
@@ -333,8 +342,49 @@ foreach my $t (@tv)
push @Tests, $new_ent;
}
+# When a multi-byte locale is available, also run every test in that
+# locale, as a "-mb" variant of the original test.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether join is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ # Some error messages include the test name: rewrite NAME-mb back to NAME.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
+ (@new_t))
+ {
+ my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
+ push @new_t, $sub2;
+ push @$t, $sub2;
+ }
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
@Tests = triple_test \@Tests;
+# Skip the invalid-j-mb test; it is failing because of the format.
+@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
diff --git a/tests/uniq/uniq.pl b/tests/uniq/uniq.pl
index a6354dc3c..e43cd6e3f 100755
--- a/tests/uniq/uniq.pl
+++ b/tests/uniq/uniq.pl
@@ -23,9 +23,17 @@ my $limits = getlimits ();
my $prog = 'uniq';
my $try = "Try '$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
# When possible, create a "-z"-testing variant of each test.
sub add_z_variants($)
{
@@ -262,6 +270,53 @@ foreach my $t (@Tests)
and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
}
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether uniq is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ # In test #145, replace each ‘...’ by '...'.
+ if ($test_name =~ "145")
+ {
+ my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ next if ( $test_name =~ "schar"
+ or $test_name =~ "^obs-plus"
+ or $test_name =~ "119");
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe. The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
@Tests = add_z_variants \@Tests;
@Tests = triple_test \@Tests;
--
2.39.2
From bd45f0963c42ee0d9d31b065d9e60e7435ed0523 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 30 Oct 2023 01:32:37 -0700
Subject: [PATCH 11/11] =?UTF-8?q?maint:=20pacify=20=E2=80=98make=20syntax-?=
=?UTF-8?q?check=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* tests/misc/join-utf8.sh: Omit fail=0.
Fix framework_failure_ typo.
* tests/misc/join.pl: Change ` to '.
---
tests/misc/join-utf8.sh | 4 +---
tests/misc/join.pl | 2 +-
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/tests/misc/join-utf8.sh b/tests/misc/join-utf8.sh
index b70bff7f9..a2bc3b1e5 100755
--- a/tests/misc/join-utf8.sh
+++ b/tests/misc/join-utf8.sh
@@ -24,8 +24,6 @@ test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
LC_ALL=$LOCALE_FR_UTF8
export LC_ALL
-fail=0
-
vertical_line='|'
multiplication_sign='×'
en_dash='–'
@@ -44,7 +42,7 @@ do
join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
"$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
- framework_failure
+ framework_failure_
compare exp out || fail=1
done
diff --git a/tests/misc/join.pl b/tests/misc/join.pl
index 1d01a3d3d..4eafc273d 100755
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -25,7 +25,7 @@ my $limits = getlimits ();
my $prog = 'join';
-my $try = "Try \`$prog --help' for more information.\n";
+my $try = "Try '$prog --help' for more information.\n";
my $inval = "$prog: invalid byte, character or field list\n$try";
my $mb_locale;
--
2.39.2