I fully agree with your reasoning and also prefer this one over the
previous two diff.
OK martijn@
On Sun, 2022-01-09 at 00:45 +0100, Ingo Schwarze wrote:
> Hi,
>
> Martijn van Duren wrote on Sat, Jan 08, 2022 at 08:30:20AM +0100:
>
> > Why not go for the following diff?
> > It has a comparable speed increase, but without the added complexity
> > of a second inner loop.
>
> Actually, i like the idea of not duplicating the loop, in cases where
> that is possible without a noticeable performance cost.
>
> I admit i OK'ed the original UTF-8 diff almost six years ago,
> but looking at the code again, i now dislike how it is testing
> every byte twice for no apparent reason. Even worse, i regard
> the lack of I/O error checking on write operations as a bug.
> Note that it does make sense to proceed to the next file on *read*
> errors, whereas a *write* error should better be fatal.
>
> For those reason, and because i prefer versions of isu8cont()
> that do not inspect the locale, i propose the following patch
> instead.
>
> As an aside, i tried using fwrite(3) instead of the manual
> putchar(*u) loop, but that is almost exactly as slow as repeatedly
> calling MB_CUR_MAX, probably due to either locking or orientation
> setting or some aspect of iov handling or buffering or more than
> one of those; i refrained from profiling it.
>
> ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
>
> Index: rev.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/rev/rev.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 rev.c
> --- rev.c 10 Apr 2016 17:06:52 -0000 1.13
> +++ rev.c 8 Jan 2022 23:19:46 -0000
> @@ -46,13 +46,14 @@ void usage(void);
> int
> main(int argc, char *argv[])
> {
> - char *filename, *p = NULL, *t, *u;
> + char *filename, *p = NULL, *t, *te, *u;
> FILE *fp;
> ssize_t len;
> size_t ps = 0;
> - int ch, rval;
> + int ch, multibyte, rval;
>
> setlocale(LC_CTYPE, "");
> + multibyte = MB_CUR_MAX > 1;
>
> if (pledge("stdio rpath", NULL) == -1)
> err(1, "pledge");
> @@ -83,14 +84,16 @@ main(int argc, char *argv[])
> if (p[len - 1] == '\n')
> --len;
> for (t = p + len - 1; t >= p; --t) {
> - if (isu8cont(*t))
> - continue;
> - u = t;
> - do {
> - putchar(*u);
> - } while (isu8cont(*(++u)));
> + te = t;
> + if (multibyte)
> + while (t > p && isu8cont(*t))
> + --t;
> + for (u = t; u <= te; ++u)
> + if (putchar(*u) == EOF)
> + err(1, "stdout");
> }
> - putchar('\n');
> + if (putchar('\n') == EOF)
> + err(1, "stdout");
> }
> if (ferror(fp)) {
> warn("%s", filename);
> @@ -104,7 +107,7 @@ main(int argc, char *argv[])
> int
> isu8cont(unsigned char c)
> {
> - return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80;
> + return (c & (0x80 | 0x40)) == 0x80;
> }
>
> void
>
> ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
>
> The rest of this mail contains test reports.
> These test reports use two test programs:
>
> ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
>
> /* makeutf8.c */
>
> #include <err.h>
> #include <errno.h>
> #include <locale.h>
> #include <stdio.h>
> #include <wchar.h>
>
> int
> main(void)
> {
> FILE *in, *out;
> char *line = NULL;
> size_t linesize = 0;
> unsigned int wc, wcl = 0;
>
> if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL)
> err(1, "setlocale");
>
> if ((in = fopen("/usr/src/gnu/usr.bin/perl/lib/unicore/"
> "UnicodeData.txt", "r")) == NULL)
> err(1, "in");
> if ((out = fopen("utf8.txt", "w")) == NULL)
> err(1, "out");
>
> while (getline(&line, &linesize, in) != -1) {
> if (sscanf(line, "%x;", &wc) < 1)
> err(1, "scanf: %s", line);
> if (wc > wcl + 1)
> if (fputwc(L'\n', out) == WEOF)
> err(1, "write EOL");
> if (fputwc(wc, out) == WEOF) {
> if (errno == EILSEQ)
> warn("write U+%04x", wc);
> else
> err(1, "write U+%04x", wc);
> }
> wcl = wc;
> }
> if (ferror(in))
> errx(1, "read error");
> if (fputwc(L'\n', out) == WEOF)
> err(1, "write final EOL");
>
> fclose(out);
> fclose(in);
> return 0;
> }
>
> ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
>
> /* wrap.c */
>
> #include <unistd.h>
>
> int
> main(int argc, char *argv[])
> {
> close(STDOUT_FILENO);
> execl(argv[1], argv[1], "/usr/share/dict/words", NULL);
> return 42;
> }
>
> ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
>
> # Prepare for testing.
>
> $ make makeutf8
> cc -O2 -pipe -Werror-implicit-function-declaration -MD -MP \
> -o makeutf8 /usr/src/usr.bin/rev/makeutf8.c
> $ make wrap
> cc -O2 -pipe -Werror-implicit-function-declaration -MD -MP \
> -o wrap /usr/src/usr.bin/rev/wrap.c
> $ make
> cc -O2 -pipe -Werror-implicit-function-declaration -MD -MP \
> -c /usr/src/usr.bin/rev/rev.c
> cc -o rev rev.o
> $ ./obj/makeutf8
> makeutf8: write U+d800: Illegal byte sequence
> makeutf8: write U+db7f: Illegal byte sequence
> makeutf8: write U+db80: Illegal byte sequence
> makeutf8: write U+dbff: Illegal byte sequence
> makeutf8: write U+dc00: Illegal byte sequence
> makeutf8: write U+dfff: Illegal byte sequence
> $ wc utf8.txt
> 695 693 116982 utf8.txt
>
> # Check that the current code works properly.
>
> $ unset LC_ALL
> $ export LC_CTYPE=C
> $ /oldbin/rev /usr/share/dict/words > words_ocr.txt
> $ /oldbin/rev words_ocr.txt > words_ocrr.txt
> $ /oldbin/rev utf8.txt > utf8_ocr.txt
> $ /oldbin/rev utf8_ocr.txt > utf8_ocrr.txt
> $ export LC_CTYPE=en_US.UTF-8
> $ /oldbin/rev /usr/share/dict/words > words_our.txt
> $ /oldbin/rev words_our.txt > words_ourr.txt
> $ /oldbin/rev utf8.txt > utf8_our.txt
> $ /oldbin/rev utf8_our.txt > utf8_ourr.txt
> $ diff -u /usr/share/dict/words words_ocrr.txt
> $ diff -u /usr/share/dict/words words_ourr.txt
> $ diff -u words_ocr.txt words_our.txt
> $ diff -u utf8.txt utf8_ocrr.txt
> $ diff -u utf8.txt utf8_ourr.txt
> $ cmp utf8_ocr.txt utf8_our.txt
> utf8_ocr.txt utf8_our.txt differ: char 12, line 2
>
> # Check that the new code works properly.
>
> $ export LC_CTYPE=C
> $ ./obj/rev /usr/share/dict/words > words_ncr.txt
> $ ./obj/rev utf8.txt > utf8_ncr.txt
> $ export LC_CTYPE=en_US.UTF-8
> $ ./obj/rev /usr/share/dict/words > words_nur.txt
> $ ./obj/rev utf8.txt > utf8_nur.txt
> $ diff -u words_ocr.txt words_ncr.txt
> $ diff -u words_our.txt words_nur.txt
> $ diff -u utf8_ocr.txt utf8_ncr.txt
> $ diff -u utf8_our.txt utf8_nur.txt
> $
>
> # Demonstate broken error handling in the old code.
>
> $ ./obj/wrap /obin/rev
> $ echo $?
> 42
>
> # Demonstrate working error handling in the new code.
>
> $ ./obj/wrap ./obj/rev
> rev: stdout: Bad file descriptor
> $ echo $?
> 1
>
> # Check performance.
>
> $ export LC_CTYPE=C
> $ for i in $(jot 10); do time /oldbin/rev words_ocr.txt > /dev/null; done
> 0m00.09s real 0m00.10s user 0m00.01s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.10s user 0m00.01s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.11s user 0m00.00s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.09s user 0m00.01s system
> 0m00.09s real 0m00.09s user 0m00.01s system
> 0m00.09s real 0m00.08s user 0m00.01s system
> 0m00.09s real 0m00.09s user 0m00.00s system
> $ for i in $(jot 10); do time ./obj/rev words_ocr.txt > /dev/null; done
> 0m00.02s real 0m00.03s user 0m00.00s system
> 0m00.02s real 0m00.03s user 0m00.00s system
> 0m00.02s real 0m00.02s user 0m00.00s system
> 0m00.02s real 0m00.02s user 0m00.00s system
> 0m00.02s real 0m00.02s user 0m00.02s system
> 0m00.02s real 0m00.03s user 0m00.00s system
> 0m00.02s real 0m00.02s user 0m00.00s system
> 0m00.02s real 0m00.02s user 0m00.00s system
> 0m00.02s real 0m00.03s user 0m00.00s system
> 0m00.02s real 0m00.03s user 0m00.00s system
> $ export LC_CTYPE=en_US.UTF-8
> $ for i in $(jot 10); do time /oldbin/rev words_ocr.txt > /dev/null; done
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.10s real 0m00.12s user 0m00.00s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.09s user 0m00.01s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.10s user 0m00.00s system
> 0m00.09s real 0m00.08s user 0m00.01s system
> 0m00.09s real 0m00.09s user 0m00.00s system
> 0m00.09s real 0m00.09s user 0m00.00s system
> $ for i in $(jot 10); do time ./obj/rev words_ocr.txt > /dev/null; done
> 0m00.03s real 0m00.02s user 0m00.01s system
> 0m00.03s real 0m00.02s user 0m00.00s system
> 0m00.03s real 0m00.03s user 0m00.00s system
> 0m00.03s real 0m00.03s user 0m00.00s system
> 0m00.03s real 0m00.03s user 0m00.00s system
> 0m00.03s real 0m00.03s user 0m00.00s system
> 0m00.03s real 0m00.02s user 0m00.01s system
> 0m00.03s real 0m00.03s user 0m00.00s system
> 0m00.03s real 0m00.02s user 0m00.01s system
> 0m00.03s real 0m00.03s user 0m00.00s system
>
> # Regression tests.
>
> $ doas make install
> $ cd /usr/src/regress/usr.bin/rev/
> $ make clean
> rm -f a.out [Ee]rrs mklog *.core y.tab.h out.ascii.txt out.utf8.txt
> $ make regress ; echo $?
> 0
>