I fully agree with your reasoning and also prefer this one over the
previous two diff.

OK martijn@

On Sun, 2022-01-09 at 00:45 +0100, Ingo Schwarze wrote:
> Hi,
> 
> Martijn van Duren wrote on Sat, Jan 08, 2022 at 08:30:20AM +0100:
> 
> > Why not go for the following diff?
> > It has a comparable speed increase, but without the added complexity
> > of a second inner loop.
> 
> Actually, i like the idea of not duplicating the loop, in cases where
> that is possible without a noticeable performance cost.
> 
> I admit i OK'ed the original UTF-8 diff almost six years ago,
> but looking at the code again, i now dislike how it is testing
> every byte twice for no apparent reason.  Even worse, i regard
> the lack of I/O error checking on write operations as a bug.
> Note that it does make sense to proceed to the next file on *read*
> errors, whereas a *write* error should better be fatal.
> 
> For those reason, and because i prefer versions of isu8cont()
> that do not inspect the locale, i propose the following patch
> instead.
> 
> As an aside, i tried using fwrite(3) instead of the manual
> putchar(*u) loop, but that is almost exactly as slow as repeatedly
> calling MB_CUR_MAX, probably due to either locking or orientation
> setting or some aspect of iov handling or buffering or more than
> one of those; i refrained from profiling it.
> 
>  ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
> 
> Index: rev.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/rev/rev.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 rev.c
> --- rev.c     10 Apr 2016 17:06:52 -0000      1.13
> +++ rev.c     8 Jan 2022 23:19:46 -0000
> @@ -46,13 +46,14 @@ void usage(void);
>  int
>  main(int argc, char *argv[])
>  {
> -     char *filename, *p = NULL, *t, *u;
> +     char *filename, *p = NULL, *t, *te, *u;
>       FILE *fp;
>       ssize_t len;
>       size_t ps = 0;
> -     int ch, rval;
> +     int ch, multibyte, rval;
>  
>       setlocale(LC_CTYPE, "");
> +     multibyte = MB_CUR_MAX > 1;
>  
>       if (pledge("stdio rpath", NULL) == -1)
>               err(1, "pledge");
> @@ -83,14 +84,16 @@ main(int argc, char *argv[])
>                       if (p[len - 1] == '\n')
>                               --len;
>                       for (t = p + len - 1; t >= p; --t) {
> -                             if (isu8cont(*t))
> -                                     continue;
> -                             u = t;
> -                             do {
> -                                     putchar(*u);
> -                             } while (isu8cont(*(++u)));
> +                             te = t;
> +                             if (multibyte)
> +                                     while (t > p && isu8cont(*t))
> +                                             --t;
> +                             for (u = t; u <= te; ++u)
> +                                     if (putchar(*u) == EOF)
> +                                             err(1, "stdout");
>                       }
> -                     putchar('\n');
> +                     if (putchar('\n') == EOF)
> +                             err(1, "stdout");
>               }
>               if (ferror(fp)) {
>                       warn("%s", filename);
> @@ -104,7 +107,7 @@ main(int argc, char *argv[])
>  int
>  isu8cont(unsigned char c)
>  {
> -     return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80;
> +     return (c & (0x80 | 0x40)) == 0x80;
>  }
>  
>  void
> 
>  ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
> 
> The rest of this mail contains test reports.
> These test reports use two test programs:
> 
>  ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
> 
> /* makeutf8.c */
> 
> #include <err.h>
> #include <errno.h>
> #include <locale.h>
> #include <stdio.h>
> #include <wchar.h>
> 
> int
> main(void)
> {
>       FILE            *in, *out;
>       char            *line = NULL;
>       size_t           linesize = 0;
>       unsigned int     wc, wcl = 0;
> 
>       if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL)
>               err(1, "setlocale");
> 
>       if ((in = fopen("/usr/src/gnu/usr.bin/perl/lib/unicore/"
>           "UnicodeData.txt", "r")) == NULL)
>               err(1, "in");
>       if ((out = fopen("utf8.txt", "w")) == NULL)
>               err(1, "out");
> 
>       while (getline(&line, &linesize, in) != -1) {
>               if (sscanf(line, "%x;", &wc) < 1)
>                       err(1, "scanf: %s", line);
>               if (wc > wcl + 1)
>                       if (fputwc(L'\n', out) == WEOF)
>                               err(1, "write EOL");
>               if (fputwc(wc, out) == WEOF) {
>                       if (errno == EILSEQ)
>                               warn("write U+%04x", wc);
>                       else
>                               err(1, "write U+%04x", wc);
>               }
>               wcl = wc;
>       }
>       if (ferror(in))
>               errx(1, "read error");
>       if (fputwc(L'\n', out) == WEOF)
>               err(1, "write final EOL");
> 
>       fclose(out);
>       fclose(in);
>       return 0;
> }
> 
>  ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
> 
> /* wrap.c */
> 
> #include <unistd.h>
> 
> int
> main(int argc, char *argv[])
> {
>       close(STDOUT_FILENO);
>       execl(argv[1], argv[1], "/usr/share/dict/words", NULL);
>       return 42;
> }
> 
>  ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----
> 
> # Prepare for testing.
> 
>    $ make makeutf8
>   cc -O2 -pipe  -Werror-implicit-function-declaration -MD -MP \
>      -o makeutf8 /usr/src/usr.bin/rev/makeutf8.c 
>    $ make wrap
>   cc -O2 -pipe  -Werror-implicit-function-declaration -MD -MP \
>      -o wrap /usr/src/usr.bin/rev/wrap.c 
>     $ make
>   cc -O2 -pipe  -Werror-implicit-function-declaration -MD -MP \
>      -c /usr/src/usr.bin/rev/rev.c
>   cc   -o rev rev.o 
>    $ ./obj/makeutf8
>   makeutf8: write U+d800: Illegal byte sequence
>   makeutf8: write U+db7f: Illegal byte sequence
>   makeutf8: write U+db80: Illegal byte sequence
>   makeutf8: write U+dbff: Illegal byte sequence
>   makeutf8: write U+dc00: Illegal byte sequence
>   makeutf8: write U+dfff: Illegal byte sequence
>    $ wc utf8.txt
>      695     693  116982 utf8.txt
> 
> # Check that the current code works properly.
> 
>    $ unset LC_ALL
>    $ export LC_CTYPE=C
>    $ /oldbin/rev /usr/share/dict/words > words_ocr.txt
>    $ /oldbin/rev words_ocr.txt > words_ocrr.txt
>    $ /oldbin/rev utf8.txt > utf8_ocr.txt
>    $ /oldbin/rev utf8_ocr.txt > utf8_ocrr.txt
>    $ export LC_CTYPE=en_US.UTF-8
>    $ /oldbin/rev /usr/share/dict/words > words_our.txt
>    $ /oldbin/rev words_our.txt > words_ourr.txt
>    $ /oldbin/rev utf8.txt > utf8_our.txt
>    $ /oldbin/rev utf8_our.txt > utf8_ourr.txt
>    $ diff -u /usr/share/dict/words words_ocrr.txt
>    $ diff -u /usr/share/dict/words words_ourr.txt
>    $ diff -u words_ocr.txt words_our.txt
>    $ diff -u utf8.txt utf8_ocrr.txt
>    $ diff -u utf8.txt utf8_ourr.txt
>    $ cmp utf8_ocr.txt utf8_our.txt                                
>   utf8_ocr.txt utf8_our.txt differ: char 12, line 2
> 
> # Check that the new code works properly.
> 
>    $ export LC_CTYPE=C
>    $ ./obj/rev /usr/share/dict/words > words_ncr.txt
>    $ ./obj/rev utf8.txt > utf8_ncr.txt
>    $ export LC_CTYPE=en_US.UTF-8
>    $ ./obj/rev /usr/share/dict/words > words_nur.txt
>    $ ./obj/rev utf8.txt > utf8_nur.txt
>    $ diff -u words_ocr.txt words_ncr.txt                          
>    $ diff -u words_our.txt words_nur.txt  
>    $ diff -u utf8_ocr.txt utf8_ncr.txt    
>    $ diff -u utf8_our.txt utf8_nur.txt  
>    $
> 
> # Demonstate broken error handling in the old code.
> 
>    $ ./obj/wrap /obin/rev               
>    $ echo $?              
>   42
> 
> # Demonstrate working error handling in the new code.
> 
>    $ ./obj/wrap ./obj/rev               
>   rev: stdout: Bad file descriptor
>    $ echo $?              
>   1
> 
> # Check performance.
> 
>    $ export LC_CTYPE=C
>    $ for i in $(jot 10); do time /oldbin/rev words_ocr.txt > /dev/null; done
>     0m00.09s real     0m00.10s user    0m00.01s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.10s user     0m00.01s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.11s user     0m00.00s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.09s user     0m00.01s system
>     0m00.09s real     0m00.09s user     0m00.01s system
>     0m00.09s real     0m00.08s user     0m00.01s system
>     0m00.09s real     0m00.09s user     0m00.00s system
>    $ for i in $(jot 10); do time ./obj/rev words_ocr.txt > /dev/null; done
>     0m00.02s real     0m00.03s user     0m00.00s system
>     0m00.02s real     0m00.03s user     0m00.00s system
>     0m00.02s real     0m00.02s user     0m00.00s system
>     0m00.02s real     0m00.02s user     0m00.00s system
>     0m00.02s real     0m00.02s user     0m00.02s system
>     0m00.02s real     0m00.03s user     0m00.00s system
>     0m00.02s real     0m00.02s user     0m00.00s system
>     0m00.02s real     0m00.02s user     0m00.00s system
>     0m00.02s real     0m00.03s user     0m00.00s system
>     0m00.02s real     0m00.03s user     0m00.00s system
>    $ export LC_CTYPE=en_US.UTF-8
>    $ for i in $(jot 10); do time /oldbin/rev words_ocr.txt > /dev/null; done
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.10s real     0m00.12s user     0m00.00s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.09s user     0m00.01s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.10s user     0m00.00s system
>     0m00.09s real     0m00.08s user     0m00.01s system
>     0m00.09s real     0m00.09s user     0m00.00s system
>     0m00.09s real     0m00.09s user     0m00.00s system
>    $ for i in $(jot 10); do time ./obj/rev words_ocr.txt > /dev/null; done
>     0m00.03s real     0m00.02s user     0m00.01s system
>     0m00.03s real     0m00.02s user     0m00.00s system
>     0m00.03s real     0m00.03s user     0m00.00s system
>     0m00.03s real     0m00.03s user     0m00.00s system
>     0m00.03s real     0m00.03s user     0m00.00s system
>     0m00.03s real     0m00.03s user     0m00.00s system
>     0m00.03s real     0m00.02s user     0m00.01s system
>     0m00.03s real     0m00.03s user     0m00.00s system
>     0m00.03s real     0m00.02s user     0m00.01s system
>     0m00.03s real     0m00.03s user     0m00.00s system
> 
> # Regression tests.
> 
>    $ doas make install
>    $ cd /usr/src/regress/usr.bin/rev/
>    $ make clean
>   rm -f a.out [Ee]rrs mklog *.core y.tab.h       out.ascii.txt out.utf8.txt  
>    $ make regress ; echo $?
>   0
> 

Reply via email to