When mbrtowc() returns (size_t)-2, all n bytes have been processed. Bionic's interpretation of POSIX is that you must not re-supply those bytes on the next call, and should supply only the bytes needed to complete the character. If you do re-supply those bytes on the next call, bionic considers that an illegal sequence and returns (size_t)-1.
With these changes, the tests still pass on glibc and also pass on bionic.
---
 tests/wc.test   |  7 +------
 toys/posix/wc.c | 34 ++++++++++++++--------------------
 2 files changed, 15 insertions(+), 26 deletions(-)
From 9f9532b4c77d7c8f579a151c8bbb0fdd52d83f0f Mon Sep 17 00:00:00 2001 From: Elliott Hughes <e...@google.com> Date: Fri, 4 Aug 2017 17:41:23 -0700 Subject: [PATCH] Fix wc -m on bionic. When mbrtowc returns -2, all n bytes have been processed. Bionic's interpretation of POSIX is that you must not re-supply those bytes on the next call, and should only supply the bytes needed to complete the character. If you re-supply the bytes on the next call, bionic considers that an illegal sequence and returns -1. With these changes, the tests still pass on glibc and also pass on bionic. --- tests/wc.test | 7 +------ toys/posix/wc.c | 34 ++++++++++++++-------------------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/wc.test b/tests/wc.test index 012bd53..a7fcc9a 100755 --- a/tests/wc.test +++ b/tests/wc.test @@ -31,12 +31,7 @@ then echo -n "ü" >> file1 done testing "-m" "wc -m file1" "8193 file1\n" "" "" - testing "-m 2" 'cat "$FILES/utf8/test2.txt" | wc -m' "169\n" "" "" - echo -n " " > file1 - for i in $(seq 1 8192) - do - echo -n "ü" >> file1 - done + testing "-m stdin" 'cat "$FILES/utf8/test2.txt" | wc -m' "169\n" "" "" testing "-m (invalid chars)" "wc -m file1" "8193 file1\n" "" "" NOSPACE=1 testing "-mlw" "wc -mlw input" " 1 2 11 input\n" "hello, 世界!\n" "" diff --git a/toys/posix/wc.c b/toys/posix/wc.c index a8c3e45..3f6f4df 100644 --- a/toys/posix/wc.c +++ b/toys/posix/wc.c @@ -49,7 +49,6 @@ static void show_lengths(unsigned long *lengths, char *name) static void do_wc(int fd, char *name) { - int len = 0, clen = 1, space = 0; unsigned long word = 0, lengths[] = {0,0,0,0}; // Speed up common case: wc -c normalfile is file length. 
@@ -64,28 +63,26 @@ static void do_wc(int fd, char *name) } for (;;) { - int pos, done = 0, len2 = read(fd, toybuf+len, sizeof(toybuf)-len); + int pos, len = read(fd, toybuf, sizeof(toybuf)); - if (len2<0) perror_msg_raw(name); - else len += len2; - if (len2<1) done++; + if (len<0) perror_msg_raw(name); + if (len<1) break; for (pos = 0; pos<len; pos++) { + int space; + if (toybuf[pos]=='\n') lengths[0]++; lengths[2]++; + if (toys.optflags&FLAG_m) { - // If we've consumed next wide char - if (--clen<1) { - wchar_t wchar; - - // next wide size, don't count invalid, fetch more data if necessary - clen = mbrtowc(&wchar, toybuf+pos, len-pos, 0); - if (clen == -1) continue; - if (clen == -2 && !done) break; - - lengths[3]++; - space = iswspace(wchar); - } + wchar_t wchar; + int clen = mbrtowc(&wchar, toybuf+pos, len-pos, 0); + + if (clen == -1) continue; // Don't count invalid. + if (clen == -2) break; // Fetch more data to complete sequence. + + lengths[3]++; + space = iswspace(wchar); } else space = isspace(toybuf[pos]); if (space) word=0; @@ -94,9 +91,6 @@ static void do_wc(int fd, char *name) word=1; } } - if (done) break; - if (pos != len) memmove(toybuf, toybuf+pos, len-pos); - len -= pos; } show: -- 2.14.0.rc1.383.gd1ce394fe2-goog
_______________________________________________ Toybox mailing list Toybox@lists.landley.net http://lists.landley.net/listinfo.cgi/toybox-landley.net