When mbrtowc returns -2 (an incomplete but potentially valid multibyte
sequence), all n bytes have been processed. Bionic's interpretation of
POSIX is that you must not re-supply those bytes on the next call, and
should supply only the bytes needed to complete the character. If you do
re-supply them, bionic treats the input as an illegal sequence and
returns -1.

With these changes, the tests still pass on glibc and also pass on bionic.
---
 tests/wc.test   |  7 +------
 toys/posix/wc.c | 34 ++++++++++++++--------------------
 2 files changed, 15 insertions(+), 26 deletions(-)
From 9f9532b4c77d7c8f579a151c8bbb0fdd52d83f0f Mon Sep 17 00:00:00 2001
From: Elliott Hughes <e...@google.com>
Date: Fri, 4 Aug 2017 17:41:23 -0700
Subject: [PATCH] Fix wc -m on bionic.

When mbrtowc returns -2 (an incomplete but potentially valid multibyte
sequence), all n bytes have been processed. Bionic's interpretation of
POSIX is that you must not re-supply those bytes on the next call, and
should supply only the bytes needed to complete the character. If you do
re-supply them, bionic treats the input as an illegal sequence and
returns -1.

With these changes, the tests still pass on glibc and also pass on bionic.
---
 tests/wc.test   |  7 +------
 toys/posix/wc.c | 34 ++++++++++++++--------------------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/tests/wc.test b/tests/wc.test
index 012bd53..a7fcc9a 100755
--- a/tests/wc.test
+++ b/tests/wc.test
@@ -31,12 +31,7 @@ then
     echo -n "ü" >> file1
   done
   testing "-m" "wc -m file1" "8193 file1\n" "" ""
-  testing "-m 2" 'cat "$FILES/utf8/test2.txt" | wc -m' "169\n" "" ""
-  echo -n " " > file1
-  for i in $(seq 1 8192)
-  do
-    echo -n "ü" >> file1
-  done
+  testing "-m stdin" 'cat "$FILES/utf8/test2.txt" | wc -m' "169\n" "" ""
   testing "-m (invalid chars)" "wc -m file1" "8193 file1\n" "" ""
   NOSPACE=1 testing "-mlw" "wc -mlw input" " 1 2 11 input\n" "hello, 世界!\n" ""
 
diff --git a/toys/posix/wc.c b/toys/posix/wc.c
index a8c3e45..3f6f4df 100644
--- a/toys/posix/wc.c
+++ b/toys/posix/wc.c
@@ -49,7 +49,6 @@ static void show_lengths(unsigned long *lengths, char *name)
 
 static void do_wc(int fd, char *name)
 {
-  int len = 0, clen = 1, space = 0;
   unsigned long word = 0, lengths[] = {0,0,0,0};
 
   // Speed up common case: wc -c normalfile is file length.
@@ -64,28 +63,26 @@ static void do_wc(int fd, char *name)
   }
 
   for (;;) {
-    int pos, done = 0, len2 = read(fd, toybuf+len, sizeof(toybuf)-len);
+    int pos, len = read(fd, toybuf, sizeof(toybuf));
 
-    if (len2<0) perror_msg_raw(name);
-    else len += len2;
-    if (len2<1) done++;
+    if (len<0) perror_msg_raw(name);
+    if (len<1) break;
 
     for (pos = 0; pos<len; pos++) {
+      int space;
+
       if (toybuf[pos]=='\n') lengths[0]++;
       lengths[2]++;
+
       if (toys.optflags&FLAG_m) {
-        // If we've consumed next wide char
-        if (--clen<1) {
-          wchar_t wchar;
-
-          // next wide size, don't count invalid, fetch more data if necessary
-          clen = mbrtowc(&wchar, toybuf+pos, len-pos, 0);
-          if (clen == -1) continue;
-          if (clen == -2 && !done) break;
-
-          lengths[3]++;
-          space = iswspace(wchar);
-        }
+        wchar_t wchar;
+        int clen = mbrtowc(&wchar, toybuf+pos, len-pos, 0);
+
+        if (clen == -1) continue; // Don't count invalid.
+        if (clen == -2) break; // Fetch more data to complete sequence.
+
+        lengths[3]++;
+        space = iswspace(wchar);
       } else space = isspace(toybuf[pos]);
 
       if (space) word=0;
@@ -94,9 +91,6 @@ static void do_wc(int fd, char *name)
         word=1;
       }
     }
-    if (done) break;
-    if (pos != len) memmove(toybuf, toybuf+pos, len-pos);
-    len -= pos;
   }
 
 show:
-- 
2.14.0.rc1.383.gd1ce394fe2-goog

_______________________________________________
Toybox mailing list
Toybox@lists.landley.net
http://lists.landley.net/listinfo.cgi/toybox-landley.net

Reply via email to