Re: utf8 in rs

Ingo Schwarze Sat, 14 Nov 2015 10:17:48 -0800

Hi,

Christian Weisgerber wrote on Fri, Oct 23, 2015 at 03:52:31PM +0200:
> Ted Unangst:


>> I'm very scared to try counting chars vs bytes upfront in such code.

Actually, that turns out to be simple.

> I think that's insufficient to cover rs's functionality.  -z will
> overestimate the required widths and... yes, -j is completely broken.

So here is a patch getting it completely right, including all options
and full support for zero-width and double-width characters, invalid
bytes, and non-printable characters.

The structure of the code in rs.c is completely untouched, but as
a side effect, it got shorter by a handful of lines and i got rid
of all instances of "**".

As with my ls(1) and ul(1) patches, i have put the function doing
the heavy lifting with multibyte characters into a dedicated file
utf8.c.  Just like in those two cases, that function uses the
standard interfaces mbtowc(3) and mbwidth(3).

But i'm now keeping two steps seperate:  First, add UTF-8 support
by adding a file utf8.c using standard interfaces.  Later, implement
the lookup table to support the standard functions used in these
interfaces in a simpler way.

I think that way we can actually start committing such patches and
improve our userland.

Two final notes:

 1. It turns out each of the three programs needs exactly one
    multibyte-character helper function in utf8.c, and each helper
    function uses mbtowc(3) and mbwidth(3), but all three helper
    functions have different functionality.  So we don't know yet
    exactly which functionality will be needed most.  But that's
    no problem because these functions are very small can comfortably
    live in utf8.c.

 2. For this particular program, it seems best to me to keep the
    behaviour for LC_CTYPE=C completely unchanged; no validation
    of bytes whatsoever.
    For LC_CTYPE=en_US.UTF-8, however, that's not possible because
    non-printable Unicode characters have undefined width, and
    passing through non-printable Unicode characters seems like
    an even worse idea than passing through non-printable ASCII
    characters.
    So, i'm replacing valid, but non-printable Unicode characters
    with "?", just like i'm replacing invalid bytes with "?".
    Of course, this can easily be tweaked, for example to pass
    through non-printable characters and treat them as width 0.

OK?
  Ingo


Index: Makefile
===================================================================
RCS file: /cvs/src/usr.bin/rs/Makefile,v
retrieving revision 1.2
diff -u -p -r1.2 Makefile
--- Makefile    26 Jun 1996 05:38:46 -0000      1.2
+++ Makefile    14 Nov 2015 17:34:09 -0000
@@ -1,6 +1,6 @@
 #      $OpenBSD: Makefile,v 1.2 1996/06/26 05:38:46 deraadt Exp $
 
-
 PROG=  rs
+SRCS=  rs.c utf8.c
 
 .include <bsd.prog.mk>
Index: rs.c
===================================================================
RCS file: /cvs/src/usr.bin/rs/rs.c,v
retrieving revision 1.29
diff -u -p -r1.29 rs.c
--- rs.c        14 Nov 2015 17:03:02 -0000      1.29
+++ rs.c        14 Nov 2015 17:34:09 -0000
@@ -39,11 +39,17 @@
 #include <err.h>
 #include <errno.h>
 #include <limits.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
+struct cell {
+       int      w;  /* Display width. */
+       char    *s;  /* Multibyte string. */
+};
+
 long   flags;
 #define        TRANSPOSE       000001
 #define        MTRANSPOSE      000002
@@ -63,26 +69,27 @@ long        flags;
 
 short  *colwidths;
 int    nelem;
-char   **elem;
-char   **endelem;
+struct cell *elem;
+struct cell *endelem;
 char   *curline;
 int    allocsize = BUFSIZ;
-ssize_t        curlen;
 int    irows, icols;
 int    orows, ocols;
-ssize_t        maxlen;
+int    maxwidth;
 int    skip;
 int    propgutter;
 char   isep = ' ', osep = ' ';
 int    owidth = 80, gutter = 2;
 
+int      mbsavis(char **, const char *);
+
 void     usage(void);
 void     getargs(int, char *[]);
 void     getfile(void);
 int      get_line(void);
-char   **getptrs(char **);
+struct cell *getptrs(struct cell *);
 void     prepfile(void);
-void     prints(char *, int);
+void     prints(struct cell *, int);
 void     putfile(void);
 
 #define INCR(ep) do {                  \
@@ -93,6 +100,8 @@ void   putfile(void);
 int
 main(int argc, char *argv[])
 {
+       setlocale(LC_CTYPE, "");
+
        if (pledge("stdio", NULL) == -1)
                err(1, "pledge");
 
@@ -110,13 +119,14 @@ main(int argc, char *argv[])
 void
 getfile(void)
 {
+       const char delim[2] = { isep, '\0' };
        char *p;
-       char *endp;
-       char **ep = NULL;
+       struct cell *ep;
        int multisep = (flags & ONEISEPONLY ? 0 : 1);
        int nullpad = flags & NULLPAD;
-       char **padto;
+       struct cell *padto;
 
+       curline = NULL;
        while (skip--) {
                if (get_line() == EOF)
                        return;
@@ -125,67 +135,67 @@ getfile(void)
        }
        if (get_line() == EOF)
                return;
-       if (flags & NOARGS && curlen < owidth)
+       if (flags & NOARGS && strlen(curline) < (size_t)owidth)
                flags |= ONEPERLINE;
        if (flags & ONEPERLINE)
                icols = 1;
        else                            /* count cols on first line */
-               for (p = curline, endp = curline + curlen; p < endp; p++) {
+               for (p = curline; *p != '\0'; p++) {
                        if (*p == isep && multisep)
                                continue;
                        icols++;
                        while (*p && *p != isep)
                                p++;
                }
-       ep = getptrs(elem);
+       ep = getptrs(NULL);
        p = curline;
        do {
                if (flags & ONEPERLINE) {
-                       *ep = curline;
+                       ep->w = mbsavis(&ep->s, curline);
+                       if (maxwidth < ep->w)
+                               maxwidth = ep->w;
                        INCR(ep);               /* prepare for next entry */
-                       if (maxlen < curlen)
-                               maxlen = curlen;
                        irows++;
                        continue;
                }
-               for (p = curline, endp = curline + curlen; p < endp; p++) {
-                       if (*p == isep && multisep)
-                               continue;       /* eat up column separators */
-                       if (*p == isep)         /* must be an empty column */
-                               *ep = "";
-                       else                    /* store column entry */
-                               *ep = p;
-                       while (p < endp && *p != isep)
-                               p++;            /* find end of entry */
-                       *p = '\0';              /* mark end of entry */
-                       if (maxlen < p - *ep)   /* update maxlen */
-                               maxlen = p - *ep;
+               p = curline;
+               while (p != NULL && *p != '\0') {
+                       if (*p == isep) {
+                               p++;
+                               if (multisep)
+                                       continue;
+                               ep->s = "";     /* empty column */
+                               ep->w = 0;
+                       } else
+                               ep->w = mbsavis(&ep->s, strsep(&p, delim));
+                       if (maxwidth < ep->w)
+                               maxwidth = ep->w;
                        INCR(ep);               /* prepare for next entry */
                }
                irows++;                        /* update row count */
                if (nullpad) {                  /* pad missing entries */
                        padto = elem + irows * icols;
                        while (ep < padto) {
-                               *ep = "";
+                               ep->s = "";
+                               ep->w = 0;
                                INCR(ep);
                        }
                }
        } while (get_line() != EOF);
-       *ep = NULL;                             /* mark end of pointers */
        nelem = ep - elem;
 }
 
 void
 putfile(void)
 {
-       char **ep;
+       struct cell *ep;
        int i, j, n;
 
        ep = elem;
        if (flags & TRANSPOSE) {
                for (i = 0; i < orows; i++) {
                        for (j = i; j < nelem; j += orows)
-                               prints(ep[j], (j - i) / orows);
+                               prints(ep + j, (j - i) / orows);
                        putchar('\n');
                }
        } else {
@@ -193,7 +203,7 @@ putfile(void)
                        for (j = 0; j < ocols; j++) {
                                if (n++ >= nelem)
                                        break;
-                               prints(*ep++, j);
+                               prints(ep++, j);
                        }
                        putchar('\n');
                }
@@ -201,19 +211,15 @@ putfile(void)
 }
 
 void
-prints(char *s, int col)
+prints(struct cell *ep, int col)
 {
        int n;
-       char *p = s;
 
-       while (*p)
-               p++;
-       n = (flags & ONEOSEPONLY ? 1 : colwidths[col] - (p - s));
+       n = (flags & ONEOSEPONLY ? 1 : colwidths[col] - ep->w);
        if (flags & RIGHTADJUST)
                while (n-- > 0)
                        putchar(osep);
-       for (p = s; *p; p++)
-               putchar(*p);
+       fputs(ep->s, stdout);
        while (n-- > 0)
                putchar(osep);
 }
@@ -232,18 +238,18 @@ usage(void)
 void
 prepfile(void)
 {
-       char **ep;
+       struct cell *ep;
        int  i;
        int  j;
-       char **lp;
+       struct cell *lp;
        int colw;
        int max = 0;
        int n;
 
        if (!nelem)
                exit(0);
-       gutter += maxlen * propgutter / 100.0;
-       colw = maxlen + gutter;
+       gutter += maxwidth * propgutter / 100.0;
+       colw = maxwidth + gutter;
        if (flags & MTRANSPOSE) {
                orows = icols;
                ocols = irows;
@@ -263,14 +269,11 @@ prepfile(void)
                orows = nelem / ocols + (nelem % ocols ? 1 : 0);
        else if (ocols == 0)                    /* decide on cols */
                ocols = nelem / orows + (nelem % orows ? 1 : 0);
-       lp = elem + orows * ocols;
-       while (lp > endelem) {
-               getptrs(elem + nelem);
-               lp = elem + orows * ocols;
-       }
+       while ((lp = elem + orows * ocols) > endelem)
+            (void)getptrs(NULL);
        if (flags & RECYCLE) {
                for (ep = elem + nelem; ep < lp; ep++)
-                       *ep = *(ep - nelem);
+                       memcpy(ep, ep - nelem, sizeof(*ep));
                nelem = lp - elem;
        }
        if (!(colwidths = calloc(ocols, sizeof(short))))
@@ -279,13 +282,13 @@ prepfile(void)
                for (ep = elem, i = 0; i < ocols; i++) {
                        max = 0;
                        if (flags & TRANSPOSE) {
-                               for (j = 0; j < orows; j++)
-                                       if ((n = strlen(*ep++)) > max)
-                                               max = n;
+                               for (j = 0; j < orows; j++, ep++)
+                                       if (ep->w > max)
+                                               max = ep->w;
                        } else {
                                for (j = i; j < nelem; j += ocols)
-                                       if ((n = strlen(ep[j])) > max)
-                                               max = n;
+                                       if (ep[j].w > max)
+                                               max = ep[j].w;
                        }
                        colwidths[i] = max + gutter;
                }
@@ -305,43 +308,38 @@ prepfile(void)
 }
 
 int
-get_line(void) /* get line; maintain curline, curlen; manage storage */
+get_line(void)
 {
-       static  char    *ibuf = NULL;
-       static  size_t   ibufsz = 0;
+       static  size_t   cursz;
+       static  ssize_t  curlen;
 
        if (irows > 0 && flags & DETAILSHAPE)
                printf(" %zd line %d\n", curlen, irows);
 
-       if ((curlen = getline(&ibuf, &ibufsz, stdin)) == EOF) {
+       if ((curlen = getline(&curline, &cursz, stdin)) == EOF) {
                if (ferror(stdin))
                        err(1, NULL);
                return EOF;
        }
-       if (curlen > 0 && ibuf[curlen - 1] == '\n')
-               ibuf[--curlen] = '\0';
-
-       if (skip >= 0 || flags & SHAPEONLY)
-               curline = ibuf;
-       else if ((curline = strdup(ibuf)) == NULL)
-               err(1, NULL);
+       if (curlen > 0 && curline[curlen - 1] == '\n')
+               curline[--curlen] = '\0';
 
        return 0;
 }
 
-char **
-getptrs(char **sp)
+struct cell *
+getptrs(struct cell *sp)
 {
-       char **p;
+       struct cell *p;
        int newsize;
 
        newsize = allocsize * 2;
-       p = reallocarray(elem, newsize, sizeof(char *));
+       p = reallocarray(elem, newsize, sizeof(*p));
        if (p == NULL)
                err(1, "no memory");
 
        allocsize = newsize;
-       sp += p - elem;
+       sp = sp == NULL ? p : p + (sp - elem);
        elem = p;
        endelem = elem + allocsize;
        return(sp);
Index: utf8.c
===================================================================
RCS file: utf8.c
diff -N utf8.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ utf8.c      14 Nov 2015 17:34:09 -0000
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2015 Ingo Schwarze <schwa...@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wchar.h>
+
+int
+mbsavis(char** outp, const char *mbs)
+{
+       const char *src;  /* Iterate mbs. */
+       char     *dst;  /* Iterate *outp. */
+       wchar_t   wc;
+       int       total_width;  /* Display width of the whole string. */
+       int       width;  /* Display width of a single Unicode char. */
+       int       len;  /* Length in bytes of UTF-8 encoded string. */
+
+       len = strlen(mbs);
+       if ((*outp = malloc(len + 1)) == NULL)
+               err(1, NULL);
+
+       if (MB_CUR_MAX == 1) {
+               memcpy(*outp, mbs, len + 1);
+               return len;
+       }
+
+       src = mbs;
+       dst = *outp;
+       total_width = 0;
+       while (*src != '\0') {
+               if ((len = mbtowc(&wc, src, MB_CUR_MAX)) == -1) {
+                       total_width++;
+                       *dst++ = '?';
+                       src++;
+               } else if ((width = wcwidth(wc)) == -1) {
+                       total_width++;
+                       *dst++ = '?';
+                       src += len;
+               } else {
+                       total_width += width;
+                       while (len-- > 0)
+                               *dst++ = *src++;
+               }
+       }
+       *dst = '\0';
+       return total_width;
+}

Re: utf8 in rs

Reply via email to