bug#34524: wc: word count incorrect when words separated only by no-break space

Pádraig Brady Mon, 25 Feb 2019 20:28:26 -0800

On 24/02/19 19:55, Pádraig Brady wrote:
> On 24/02/19 17:07, Pádraig Brady wrote:
>> So non break space is generally considered a word delimiter,
>> though there are complications you detail from unicode.
>>
>> In regard to options for enabling various behaviors for wc(1),
>> I'm thinking we might keep the strict POSIX isspace() behavior
>> with LC_CTYPE=C and/or POSIXLY_CORRECT=1, and use iswnbspace()
>> by default, since that's the most common operation one would want,
>> and is consistent with libreoffice for example.
>> I'll adjust the patch along those lines.
> 
> Full patch attached.


Updated patch attached. I'll push in a few hours.
Marking this bug as done.

cheers,
Pádraig.

From c04ff0df5dfe788a38162cb2609b38495e765383 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Sat, 23 Feb 2019 21:23:47 -0800
Subject: [PATCH] wc: treat non break space as a word separator

* src/wc.c (iswnbspace): A new function to match
characters in this class.
(main): Initialize posixly_correct from the environment,
to allow disabling honoring NBSP in non C locales.
(wc): Call is[w]nbspace() as well as is[w]space.
* bootstrap.conf: Ensure btowc is available.
* tests/misc/wc-nbsp.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the change in behavior.
---
 NEWS                  |  3 +++
 bootstrap.conf        |  1 +
 src/wc.c              | 25 +++++++++++++++++++++++--
 tests/local.mk        |  1 +
 tests/misc/wc-nbsp.sh | 42 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100755 tests/misc/wc-nbsp.sh

diff --git a/NEWS b/NEWS
index e400554..9bfa3c3 100644
--- a/NEWS
+++ b/NEWS
@@ -53,6 +53,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   operator, so POSIX changed this to 'test -e FILE'.  Scripts using it were
   already broken and non-portable; the -a unary operator was never documented.
 
+  wc now treats non breaking space characters as word delimiters
+  unless the POSIXLY_CORRECT environment variable is set.
+
 ** New features
 
   id now supports specifying multiple users.
diff --git a/bootstrap.conf b/bootstrap.conf
index a525ef4..4926152 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -38,6 +38,7 @@ gnulib_modules="
   backup-rename
   base32
   base64
+  btowc
   buffer-lcm
   c-strcase
   cl-strtod
diff --git a/src/wc.c b/src/wc.c
index 179abbe..2381804 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -74,6 +74,9 @@ static bool have_read_stdin;
 /* Used to determine if file size can be determined without reading.  */
 static size_t page_size;
 
+/* Enable to _not_ treat non breaking space as a word separator.  */
+static bool posixly_correct;
+
 /* The result of calling fstat or stat on a file descriptor or file.  */
 struct fstatus
 {
@@ -147,6 +150,21 @@ the following order: newline, word, character, byte, maximum line length.\n\
   exit (status);
 }
 
+/* Return non zero if a non breaking space.  */
+static int _GL_ATTRIBUTE_PURE
+iswnbspace (wint_t wc)
+{
+  return ! posixly_correct
+         && (wc == 0x00A0 || wc == 0x2007
+             || wc == 0x202F || wc == 0x2060);
+}
+
+static int
+isnbspace (int c)
+{
+  return iswnbspace (btowc (c));
+}
+
 /* FILE is the name of the file (or NULL for standard input)
    associated with the specified counters.  */
 static void
@@ -455,7 +473,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                           if (width > 0)
                             linepos += width;
                         }
-                      if (iswspace (wide_char))
+                      if (iswspace (wide_char) || iswnbspace (wide_char))
                         goto mb_word_separator;
                       in_word = true;
                     }
@@ -538,7 +556,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                   if (isprint (to_uchar (p[-1])))
                     {
                       linepos++;
-                      if (isspace (to_uchar (p[-1])))
+                      if (isspace (to_uchar (p[-1]))
+                          || isnbspace (to_uchar (p[-1])))
                         goto word_separator;
                       in_word = true;
                     }
@@ -681,6 +700,8 @@ main (int argc, char **argv)
      so that processes running in parallel do not intersperse their output.  */
   setvbuf (stdout, NULL, _IOLBF, 0);
 
+  posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
+
   print_lines = print_words = print_chars = print_bytes = false;
   print_linelength = false;
   total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
diff --git a/tests/local.mk b/tests/local.mk
index 4751886..bacc5d2 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -272,6 +272,7 @@ all_tests =					\
   tests/misc/wc.pl				\
   tests/misc/wc-files0-from.pl			\
   tests/misc/wc-files0.sh			\
+  tests/misc/wc-nbsp.sh				\
   tests/misc/wc-parallel.sh			\
   tests/misc/wc-proc.sh				\
   tests/misc/cat-proc.sh			\
diff --git a/tests/misc/wc-nbsp.sh b/tests/misc/wc-nbsp.sh
new file mode 100755
index 0000000..11ee0d6
--- /dev/null
+++ b/tests/misc/wc-nbsp.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+# Test non breaking space handling
+
+# Copyright (C) 2019 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ wc printf
+
+# Before coreutils 8.31 nbsp was treated as part of a word,
+# rather than a word delimiter
+
+export LC_ALL=en_US.ISO-8859-1
+if test "$(locale charmap 2>/dev/null)" = ISO-8859-1; then
+  test $(env printf '=\xA0=' | wc -w) = 2 || fail=1
+  test $(env printf '=\xA0=' | POSIXLY_CORRECT=1 wc -w) = 1 || fail=1
+fi
+export LC_ALL=en_US.UTF-8
+if test "$(locale charmap 2>/dev/null)" = UTF-8; then
+  test $(env printf '=\u00A0=' | wc -w) = 2 || fail=1
+  test $(env printf '=\u2007=' | wc -w) = 2 || fail=1
+  test $(env printf '=\u202F=' | wc -w) = 2 || fail=1
+  test $(env printf '=\u2060=' | wc -w) = 2 || fail=1
+fi
+export LC_ALL=ru_RU.KOI8-R
+if test "$(locale charmap 2>/dev/null)" = KOI8-R; then
+  test $(env printf '=\x9A=' | wc -w) = 2 || fail=1
+fi
+
+Exit $fail
-- 
2.9.3

bug#34524: wc: word count incorrect when words separated only by no-break space

Reply via email to