less(1) UTF-8 cleanup: pappend()

Ingo Schwarze Wed, 13 Mar 2019 10:15:54 -0700

Hi Todd,

Todd C. Miller wrote on Tue, Mar 12, 2019 at 09:14:43AM -0600:


> This looks like an improvement to me.  OK millert@

Thanks for checking the do_append() patch, i committed it.

The next step is to clean up pappend(), which reads one byte of
a multibyte character per call and outputs the multibyte character
once it is complete.  As long as the character is not yet complete,
the bytes are cached in the static buffer mbc_buf; the current
number of bytes already cached is mbc_buf_index, the byte position
of the start byte is mbc_pos.

The static variable mbc_buf_len stores the number of bytes expected
in the current character according to the start byte.  When using
standard functions, this information is irrelevant, so i'm deleting
the variable, which simplifies global state.

The proper tool to distinguish valid, incomplete, and invalid
multibyte characters is mbrtowc(3).  The simpler and generally more
recommended function mbtowc(3) which i used in earlier patches
cannot be used for this purpose because it does not distinguish
incomplete from invalid sequences.  The single call to mbrtowc(3)
replaces one call of each of the following functions and macros
that we want to get rid of: IS_ASCII_OCTET(), IS_UTF8_LEAD(),
utf_len(), IS_UTF8_TRAIL(), and is_utf8_well_formed().

Some additional observations:
 * A goto is replaced by a proper loop.
 * In case of an invalid start byte, calling store_prchar() directly
   is simpler than the detour via flush_mbc_buf().
 * When the line overflows, rather than first storing the number of
   bytes to back up in mbc_buf_index, then moving it to r at the
   end of the function, it is simpler to store it into r right away.
 * The patch replaces 11 lines of code with 10 lines of comments.  :-)

Tests done:
 * ASCII CR bytes are correctly shown encoded as ^M after ASCII bytes,
   after valid UTF-8 characters, and when interrupting incomplete
   UTF-8 characters.
 * Incomplete UTF-8 characters are correctly shown in encoded form
   when interrupted by ASCII characters, by valid UTF-8 characters,
   or by invalid bytes.
 * Width 2 Unicode characters correctly wrap to the next line when
   their left half would be in the last column of the screen.
 * Encoded representations of invalid bytes and of valid, but
   non-printable UTF-8 characters correctly wrap to the next line
   in their entirety unless all their output bytes still fit on the
   current screen output line.
 * All tests were also performed with LC_CTYPE=C.

OK?
  Ingo


Index: line.c
===================================================================
RCS file: /cvs/src/usr.bin/less/line.c,v
retrieving revision 1.28
diff -u -p -r1.28 line.c
--- line.c      13 Mar 2019 11:22:56 -0000      1.28
+++ line.c      13 Mar 2019 16:07:35 -0000
@@ -64,7 +64,6 @@ extern off_t start_attnpos;
 extern off_t end_attnpos;
 
 static char mbc_buf[MAX_UTF_CHAR_LEN];
-static int mbc_buf_len = 0;
 static int mbc_buf_index = 0;
 static off_t mbc_pos;
 
@@ -129,7 +128,6 @@ prewind(void)
        column = 0;
        cshift = 0;
        overstrike = 0;
-       mbc_buf_len = 0;
        is_null_line = 0;
        pendc = '\0';
        lmargin = 0;
@@ -683,6 +681,9 @@ flush_mbc_buf(off_t pos)
 int
 pappend(char c, off_t pos)
 {
+       mbstate_t mbs;
+       size_t sz;
+       wchar_t ch;
        int r;
 
        if (pendc) {
@@ -696,13 +697,12 @@ pappend(char c, off_t pos)
        }
 
        if (c == '\r' && bs_mode == BS_SPECIAL) {
-               if (mbc_buf_len > 0)  /* utf_mode must be on. */ {
+               if (mbc_buf_index > 0)  /* utf_mode must be on. */ {
                        /* Flush incomplete (truncated) sequence. */
                        r = flush_mbc_buf(mbc_pos);
-                       mbc_buf_index = r + 1;
-                       mbc_buf_len = 0;
+                       mbc_buf_index = 0;
                        if (r)
-                               return (mbc_buf_index);
+                               return (r + 1);
                }
 
                /*
@@ -718,40 +718,43 @@ pappend(char c, off_t pos)
        if (!utf_mode) {
                r = do_append((LWCHAR) c, NULL, pos);
        } else {
-               /* Perform strict validation in all possible cases. */
-               if (mbc_buf_len == 0) {
-retry:
-                       mbc_buf_index = 1;
-                       *mbc_buf = c;
-                       if (IS_ASCII_OCTET(c)) {
-                               r = do_append((LWCHAR) c, NULL, pos);
-                       } else if (IS_UTF8_LEAD(c)) {
-                               mbc_buf_len = utf_len(c);
+               for (;;) {
+                       if (mbc_buf_index == 0)
                                mbc_pos = pos;
-                               return (0);
-                       } else {
-                               /* UTF8_INVALID or stray UTF8_TRAIL */
-                               r = flush_mbc_buf(pos);
-                       }
-               } else if (IS_UTF8_TRAIL(c)) {
                        mbc_buf[mbc_buf_index++] = c;
-                       if (mbc_buf_index < mbc_buf_len)
+                       memset(&mbs, 0, sizeof(mbs));
+                       sz = mbrtowc(&ch, mbc_buf, mbc_buf_index, &mbs);
+                       
+                       /* Incomplete UTF-8: wait for more bytes. */
+                       if (sz == (size_t)-2)
                                return (0);
-                       if (is_utf8_well_formed(mbc_buf))
-                               r = do_append(get_wchar(mbc_buf), mbc_buf,
-                                   mbc_pos);
+
+                       /* Valid UTF-8: use the character. */
+                       if (sz != (size_t)-1) {
+                               r = do_append(ch, mbc_buf, mbc_pos) ?
+                                   mbc_buf_index : 0;
+                               break;
+                       }
+
+                       /* Invalid start byte: encode it. */
+                       if (mbc_buf_index == 1) {
+                               r = store_prchar(c, pos);
+                               break;
+                       }
+
+                       /*
+                        * Invalid continuation.
+                        * Encode the preceding bytes.
+                        * If they fit, handle the interrupting byte.
+                        * Otherwise, tell the caller to back up
+                        * by the  number of bytes that do not fit,
+                        * plus one for the new byte.
+                        */
+                       mbc_buf_index--;
+                       if ((r = flush_mbc_buf(mbc_pos) + 1) == 1)
+                               mbc_buf_index = 0;
                        else
-                               /* Complete, but not shortest form, sequence. */
-                               mbc_buf_index = r = flush_mbc_buf(mbc_pos);
-                       mbc_buf_len = 0;
-               } else {
-                       /* Flush incomplete (truncated) sequence.  */
-                       r = flush_mbc_buf(mbc_pos);
-                       mbc_buf_index = r + 1;
-                       mbc_buf_len = 0;
-                       /* Handle new char.  */
-                       if (!r)
-                               goto retry;
+                               break;
                }
        }
 
@@ -765,10 +768,7 @@ retry:
                linebuf[curr] = '\0';
                pshift(hshift - cshift);
        }
-       if (r) {
-               /* How many chars should caller back up? */
-               r = (!utf_mode) ? 1 : mbc_buf_index;
-       }
+       mbc_buf_index = 0;
        return (r);
 }
 
@@ -913,10 +913,10 @@ pflushmbc(void)
 {
        int r = 0;
 
-       if (mbc_buf_len > 0) {
+       if (mbc_buf_index > 0) {
                /* Flush incomplete (truncated) sequence.  */
                r = flush_mbc_buf(mbc_pos);
-               mbc_buf_len = 0;
+               mbc_buf_index = 0;
        }
        return (r);
 }

less(1) UTF-8 cleanup: pappend()

Reply via email to