On 17/10/2025 13:35, Pádraig Brady wrote:
On 16/10/2025 22:47, Collin Funk wrote:
Hi Pádraig,
Pádraig Brady <[email protected]> writes:
+ /* Skip a single blank or NBSP between the number and suffix. */
+ mcel_t g = mcel_scanz (*endptr);
+ if (c32isblank (g.ch) || c32isnbspace (g.ch))
+ (*endptr) += g.len;
Looks good.
if (**endptr == '\0')
break; /* Treat as no suffix. */
if (!valid_suffix (**endptr))
- return SSE_INVALID_SUFFIX;
+ {
+ /* Trailing blanks are allowed. */
+ while (isblank (to_uchar (**endptr)))
+ (*endptr)++;
+ if (**endptr == '\0')
+ break;
+
+ return SSE_INVALID_SUFFIX;
+ }
Any reason not to do the same here? This one only handles ' ' and '\t'.
Oh right, good call. I'll squash this in:
@@ -681,8 +686,7 @@ simple_strtod_human (char const *input_str,
if (!valid_suffix (**endptr))
{
/* Trailing blanks are allowed. */
- while (isblank (to_uchar (**endptr)))
- (*endptr)++;
+ *endptr = skip_str_matching (*endptr, newline_or_blank, true);
if (**endptr == '\0')
break;
I'll also add another commit to remove isblank() from
process_suffixed_number(), and add a multi-byte blank test.
There were various other multi-byte blanks issues,
and multi-byte issues in general when I looked further.
The attached 3 further patches should make numfmt fully support multi-byte.
cheers,
Padraig
From 770078e315232b49c0e113152a469df4df1e5f4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Fri, 17 Oct 2025 19:14:21 +0100
Subject: [PATCH 1/3] numfmt: fix issues with multi-byte blanks
* src/numfmt.c (process_line): Restore byte overwritten with NUL,
as it may be part of a multi-byte blank.
(process_suffixed_number): Skip multi-byte blanks,
and correctly determine width with mbswidth().
(parse_format_string): Use c_isblank() to explicitly
indicate that's all the format spec supports.
* tests/misc/numfmt.pl: Add test cases.
* NEWS: Mention the bug fix.
---
NEWS | 3 +++
src/numfmt.c | 28 ++++++++++++++++++++--------
tests/misc/numfmt.pl | 14 ++++++++++++++
3 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/NEWS b/NEWS
index b34513271..f80363f87 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,9 @@ GNU coreutils NEWS -*- outline -*-
'numfmt' no longer reads out-of-bounds memory with trailing blanks in input.
[bug introduced with numfmt in coreutils-8.21]
+ 'numfmt' no longer outputs invalid characters with multi-byte blanks in input.
+ [bug introduced in coreutils-9.5]
+
'rm -d DIR' no longer fails on Ceph snapshot directories.
Although these directories are nonempty, 'rmdir DIR' succeeds on them.
[bug introduced in coreutils-8.16]
diff --git a/src/numfmt.c b/src/numfmt.c
index 26f918054..67458558a 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -1150,7 +1150,7 @@ parse_format_string (char const *fmt)
errno = 0;
user_precision = strtol (fmt + i, &endptr, 10);
if (errno == ERANGE || user_precision < 0 || SIZE_MAX < user_precision
- || isblank (fmt[i]) || fmt[i] == '+')
+ || c_isblank (fmt[i]) || fmt[i] == '+')
{
/* Note we disallow negative user_precision to be
consistent with printf(1). POSIX states that
@@ -1340,15 +1340,18 @@ process_suffixed_number (char *text, long double *result,
devmsg ("no valid suffix found\n");
}
- /* Skip white space - always. */
- char *p = text;
- while (*p && isblank (to_uchar (*p)))
- ++p;
+ /* Skip blanks - always. */
+ char *p = skip_str_matching (text, newline_or_blank, true);
/* setup auto-padding. */
if (auto_padding)
{
- padding_width = text < p || 1 < field ? strlen (text) : 0;
+ padding_width = text < p || 1 < field
+ ? mbswidth (text,
+ MBSW_REJECT_INVALID | MBSW_REJECT_UNPRINTABLE)
+ : 0;
+ if (padding_width < 0)
+ padding_width = strlen (text);
devmsg ("setting Auto-Padding to %jd characters\n", padding_width);
}
@@ -1455,7 +1458,8 @@ process_line (char *line, bool newline)
if (*line != '\0')
{
- /* nul terminate the current field string and process */
+ /* NUL terminate the current field string and process */
+ char end_field = *line;
*line = '\0';
if (! process_field (next, field))
@@ -1463,7 +1467,15 @@ process_line (char *line, bool newline)
fputc ((delimiter == DELIMITER_DEFAULT) ?
' ' : delimiter, stdout);
- ++line;
+
+ if (delimiter != DELIMITER_DEFAULT)
+ line++;
+ else
+ {
+ *line = end_field;
+ mcel_t g = mcel_scanz (line);
+ line += g.len;
+ }
}
else
{
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index ff22c7303..2f03efd1c 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -1172,6 +1172,20 @@ my @Locale_Tests =
{ENV=>"LC_ALL=$locale"}],
['lcl-suf-6', "--from=auto '2\xe2\x81\xa0Ki'", {OUT => "2048"},
{ENV=>"LC_ALL=$locale"}],
+ # multi-byte blank char (em space, \u2003)
+ # Ensure trailing multi-byte blanks skipped
+ ['lcl-suf-7', "'2\xe2\x80\x83 '", {OUT => "2 "},
+ {ENV=>"LC_ALL=$locale"}],
+ ['lcl-suf-8', "-d '' --from=auto '2Ki\xe2\x80\x83 '", {OUT => "2048"},
+ {ENV=>"LC_ALL=$locale"}],
+ # Ensure multi-byte blank field separators not corrupted
+ ['lcl-suf-9', "--field=1 '1\xe2\x80\x832'", {OUT => "1 2"},
+ {ENV=>"LC_ALL=$locale"}],
+ ['lcl-suf-10', "--field=2 '1\xe2\x80\x832'", {OUT => "1 2"},
+ {ENV=>"LC_ALL=$locale"}],
+ # Ensure multi-byte blank field separators width determined correctly
+ ['lcl-suf-11', "--field=2 '1 \xe2\x80\x832'",
+ {OUT => "1 2"}, {ENV=>"LC_ALL=$locale"}],
);
if ($locale ne 'C')
--
2.51.0
From e5d7764386c173a37106002c6bc800866d4125e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Sat, 18 Oct 2025 13:02:44 +0100
Subject: [PATCH 2/3] numfmt: use multi-byte aware suffix matching
* src/numfmt.c (process_suffixed_number): Use gnulib's
mbs_endswith() helper, which is more robust in non UTF-8 locales.
Also always output a devmsg if a suffix is specified.
---
bootstrap.conf | 1 +
src/numfmt.c | 9 +++------
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/bootstrap.conf b/bootstrap.conf
index f470aa48b..8f9194341 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -173,6 +173,7 @@ gnulib_modules="
mbrlen
mbrtoc32
mbrtowc
+ mbs_endswith
mbschr
mbslen
mbswidth
diff --git a/src/numfmt.c b/src/numfmt.c
index 67458558a..cc80ccc5d 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -1326,14 +1326,11 @@ static int
process_suffixed_number (char *text, long double *result,
size_t *precision, long int field)
{
- if (suffix && strlen (text) > strlen (suffix))
+ if (suffix)
{
- char *possible_suffix = text + strlen (text) - strlen (suffix);
-
- if (streq (suffix, possible_suffix))
+ if (mbs_endswith (text, suffix))
{
- /* trim suffix, ONLY if it's at the end of the text. */
- *possible_suffix = '\0';
+ *(text + strlen (text) - strlen (suffix)) = '\0';
devmsg ("trimming suffix %s\n", quote (suffix));
}
else
--
2.51.0
From ff0f245fc108b99dde3bd671504ef74976179d5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Sat, 18 Oct 2025 17:44:49 +0100
Subject: [PATCH 3/3] numfmt: support multi-byte --delimiter
* bootstrap.conf: Depend on mbsstr() to robustly search for a
multi-byte delimiter character (string) within a multi-byte string.
* src/numfmt.c (main): Accept a valid multi-byte delimiter character.
(next_field): Adjust delimiter search from single byte
to multi-byte aware. Use mbsstr to find the first match.
* tests/misc/numfmt.pl: Add test case.
* NEWS: Mention the improvement.
---
NEWS | 3 ++-
bootstrap.conf | 1 +
src/numfmt.c | 46 +++++++++++++++++++++++---------------------
tests/misc/numfmt.pl | 7 +++++++
4 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/NEWS b/NEWS
index f80363f87..c5b94d792 100644
--- a/NEWS
+++ b/NEWS
@@ -44,7 +44,8 @@ GNU coreutils NEWS -*- outline -*-
** Improvements
numfmt now parses numbers with a non-breaking space character before a unit,
- and numbers containing grouping characters from the current locale.
+ and parses numbers containing grouping characters from the current locale.
+ It also supports a multi-byte --delimeter character.
wc -l now operates 10% faster on hosts that support AVX512 instructions.
diff --git a/bootstrap.conf b/bootstrap.conf
index 8f9194341..5125d6697 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -176,6 +176,7 @@ gnulib_modules="
mbs_endswith
mbschr
mbslen
+ mbsstr
mbswidth
mbszero
mcel-prefer
diff --git a/src/numfmt.c b/src/numfmt.c
index cc80ccc5d..0f0a8770b 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -156,9 +156,6 @@ static struct option const longopts[] =
{nullptr, 0, nullptr, 0}
};
-/* If delimiter has this value, blanks separate fields. */
-enum { DELIMITER_DEFAULT = CHAR_MAX + 1 };
-
/* Maximum number of digits we can safely handle
without precision loss, if scaling is 'none'. */
enum { MAX_UNSCALED_DIGITS = LDBL_DIG };
@@ -194,8 +191,8 @@ static int conv_exit_code = EXIT_CONVERSION_WARNINGS;
/* auto-pad each line based on skipped whitespace. */
static int auto_padding = 0;
-/* field delimiter */
-static int delimiter = DELIMITER_DEFAULT;
+/* field delimiter - if nullptr, blanks separate fields. */
+static char const *delimiter = nullptr;
/* line delimiter. */
static unsigned char line_delim = '\n';
@@ -1374,14 +1371,10 @@ next_field (char **line)
char *field_start = *line;
char *field_end = field_start;
- if (delimiter != DELIMITER_DEFAULT)
+ if (delimiter)
{
- if (*field_start != delimiter)
- {
- while (*field_end && *field_end != delimiter)
- ++field_end;
- }
- /* else empty field */
+ if (! *delimiter || ! (field_end = mbsstr (field_start, delimiter)))
+ field_end = strchr (field_start, '\0');
}
else
{
@@ -1462,11 +1455,13 @@ process_line (char *line, bool newline)
if (! process_field (next, field))
valid_number = false;
- fputc ((delimiter == DELIMITER_DEFAULT) ?
- ' ' : delimiter, stdout);
+ if (delimiter != nullptr)
+ fputs (delimiter, stdout);
+ else
+ fputc (' ', stdout);
- if (delimiter != DELIMITER_DEFAULT)
- line++;
+ if (delimiter)
+ line += MAX (strlen (delimiter), 1);
else
{
*line = end_field;
@@ -1573,10 +1568,17 @@ main (int argc, char **argv)
case 'd':
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
- error (EXIT_FAILURE, 0,
- _("the delimiter must be a single character"));
- delimiter = optarg[0];
+ if (optarg[0] != '\0')
+ {
+ mcel_t g = mcel_scanz (optarg);
+ /* Note we always allow single bytes, especially since mcel
+ explicitly does not avoid https://sourceware.org/PR29511
+ I.e., we ignore g.err, and rely on g.len==1 with g.err. */
+ if (optarg[g.len] != '\0')
+ error (EXIT_FAILURE, 0,
+ _("the delimiter must be a single character"));
+ }
+ delimiter = optarg;
break;
case 'z':
@@ -1642,7 +1644,7 @@ main (int argc, char **argv)
&& !grouping && (padding_width == 0) && (format_str == nullptr))
error (0, 0, _("no conversion option specified"));
- if (debug && unit_separator && delimiter == DELIMITER_DEFAULT)
+ if (debug && unit_separator && delimiter == nullptr)
error (0, 0,
_("field delimiters have higher precedence than unit separators"));
@@ -1657,7 +1659,7 @@ main (int argc, char **argv)
error (0, 0, _("grouping has no effect in this locale"));
}
- auto_padding = (padding_width == 0 && delimiter == DELIMITER_DEFAULT);
+ auto_padding = (padding_width == 0 && delimiter == nullptr);
if (inval_style != inval_abort)
conv_exit_code = 0;
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 2f03efd1c..75de1a9f9 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -283,6 +283,9 @@ my @Tests =
['delim-4', '--delimiter=: --from=auto 40M:60M', {OUT=>'40000000:60M'}],
['delim-5', '-d: --field=2 --from=auto :40M:60M', {OUT=>':40000000:60M'}],
['delim-6', '-d: --field 3 --from=auto 40M:60M', {OUT=>"40M:60M"}],
+ # Ensure we don't hit https://sourceware.org/PR29511
+ ['delim-7', "-d '\xc2' --field=2 --invalid=ignore '1\xc2\xb72K'",
+ {OUT => "1\xc2\xb72K"}],
['delim-err-1', '-d,, --to=si 1', {EXIT=>1},
{ERR => "$prog: the delimiter must be a single character\n"}],
@@ -1187,6 +1190,10 @@ my @Locale_Tests =
['lcl-suf-11', "--field=2 '1 \xe2\x80\x832'",
{OUT => "1 2"}, {ENV=>"LC_ALL=$locale"}],
+ # Support multi-byte delimiter
+ ['lcl-delim-1', "-d '\xc2\xb7' --field=2 --from=auto '1\xc2\xb72K'",
+ {OUT => "1\xc2\xb72000"}, {ENV=>"LC_ALL=$locale"}],
+
);
if ($locale ne 'C')
{
--
2.51.0