Hello tech@,

Here's my attempt to implement UTF-8 support in column(1).
Besides the general UTF-8 conversions, it does several other things to
make it behave properly. The full changelist is as follows:
- Make separator and input fully UTF-8 aware.
- Do a proper character width count. This also fixes some indentation
issues where the old code assumed that a tab was also one column wide.
- Replace tabs between columns with spaces. The old code worked fine,
but with UTF-8 input and oxtabs enabled in stty, the column positioning
can get way off, and we can't expect everyone to run with "stty -oxtabs".
Found with the help of nicm@.

OK? 

martijn@

Index: column.c
===================================================================
RCS file: /cvs/src/usr.bin/column/column.c,v
retrieving revision 1.22
diff -u -p -r1.22 column.c
--- column.c    3 Nov 2015 04:55:44 -0000       1.22
+++ column.c    27 Feb 2016 12:21:35 -0000
@@ -36,15 +36,24 @@
 #include <ctype.h>
 #include <err.h>
 #include <limits.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
+
+struct li {
+       int      width;
+       char    *str;
+};
 
 void  c_columnate(void);
 void *ereallocarray(void *, size_t, size_t);
 void *ecalloc(size_t, size_t);
 void  input(FILE *);
+int   isu8start(unsigned char);
 void  maketbl(void);
 void  print(void);
 void  r_columnate(void);
@@ -54,9 +63,9 @@ int termwidth = 80;           /* default terminal
 
 int entries;                   /* number of records */
 int eval;                      /* exit value */
-int maxlength;                 /* longest record */
-char **list;                   /* array of pointers to records */
-char *separator = "\t ";       /* field separator for table option */
+int maxwidth;                  /* longest record */
+struct li *list;               /* array of records */
+wchar_t *separator = L"\t ";   /* field separator for table option */
 
 int
 main(int argc, char *argv[])
@@ -66,6 +75,7 @@ main(int argc, char *argv[])
        int ch, tflag, xflag;
        char *p;
        const char *errstr;
+       int slen;
 
        if (ioctl(1, TIOCGWINSZ, &win) == -1 || !win.ws_col) {
                if ((p = getenv("COLUMNS")) && *p != '\0') {
@@ -79,6 +89,8 @@ main(int argc, char *argv[])
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
 
+       setlocale(LC_CTYPE, "");
+
        tflag = xflag = 0;
        while ((ch = getopt(argc, argv, "c:s:tx")) != -1)
                switch(ch) {
@@ -88,7 +100,13 @@ main(int argc, char *argv[])
                                errx(1, "%s: %s", errstr, optarg);
                        break;
                case 's':
-                       separator = optarg;
+                       slen = strlen(optarg)+1;
+                       if ((separator = reallocarray(NULL, slen,
+                           sizeof(*separator))) == NULL)
+                               err(1, NULL);
+                       if (mbstowcs(separator, optarg,
+                           slen * sizeof(*separator)) == (size_t)-1)
+                               err(1, "Unable to set separator");
                        break;
                case 't':
                        tflag = 1;
@@ -125,69 +143,59 @@ main(int argc, char *argv[])
 
        if (tflag)
                maketbl();
-       else if (maxlength >= termwidth)
+       else if (maxwidth >= termwidth)
                print();
        else if (xflag)
                c_columnate();
        else
                r_columnate();
-       exit(eval);
+       return eval;
 }
 
 #define        TAB     8
 void
 c_columnate(void)
 {
-       int chcnt, col, cnt, endcol, numcols;
-       char **lp;
+       int col, numcols;
+       struct li *lp;
 
-       maxlength = (maxlength + TAB) & ~(TAB - 1);
-       numcols = termwidth / maxlength;
-       endcol = maxlength;
-       for (chcnt = col = 0, lp = list;; ++lp) {
-               chcnt += printf("%s", *lp);
+       maxwidth = (maxwidth + TAB) & ~(TAB - 1);
+       if ((numcols = termwidth / maxwidth) == 0)
+               numcols = 1;
+       for (col = 0, lp = list;; ++lp) {
+               printf("%s", lp->str);
                if (!--entries)
                        break;
                if (++col == numcols) {
-                       chcnt = col = 0;
-                       endcol = maxlength;
+                       col = 0;
                        putchar('\n');
                } else {
-                       while ((cnt = ((chcnt + TAB) & ~(TAB - 1))) <= endcol) {
-                               (void)putchar('\t');
-                               chcnt = cnt;
-                       }
-                       endcol += maxlength;
+                       while (lp->width++ < maxwidth)
+                               (void)putchar(' ');
                }
        }
-       if (chcnt)
-               putchar('\n');
+       putchar('\n');
 }
 
 void
 r_columnate(void)
 {
-       int base, chcnt, cnt, col, endcol, numcols, numrows, row;
+       int base, col, numcols, numrows, row;
 
-       maxlength = (maxlength + TAB) & ~(TAB - 1);
-       numcols = termwidth / maxlength;
-       if (numcols == 0)
+       maxwidth = (maxwidth + TAB) & ~(TAB - 1);
+       if ((numcols = termwidth / maxwidth) == 0)
                numcols = 1;
        numrows = entries / numcols;
        if (entries % numcols)
                ++numrows;
 
        for (row = 0; row < numrows; ++row) {
-               endcol = maxlength;
-               for (base = row, chcnt = col = 0; col < numcols; ++col) {
-                       chcnt += printf("%s", list[base]);
+               for (base = row, col = 0; col < numcols; ++col) {
+                       printf("%s", list[base].str);
+                       while (list[base].width++ < maxwidth)
+                               (void)putchar(' ');
                        if ((base += numrows) >= entries)
                                break;
-                       while ((cnt = ((chcnt + TAB) & ~(TAB - 1))) <= endcol) {
-                               (void)putchar('\t');
-                               chcnt = cnt;
-                       }
-                       endcol += maxlength;
                }
                putchar('\n');
        }
@@ -197,15 +205,15 @@ void
 print(void)
 {
        int cnt;
-       char **lp;
+       struct li *lp;
 
        for (cnt = entries, lp = list; cnt--; ++lp)
-               (void)printf("%s\n", *lp);
+               (void)printf("%s\n", lp->str);
 }
 
 typedef struct _tbl {
-       char **list;
-       int cols, *len;
+       wchar_t **list;
+       int cols, *width;
 } TBL;
 #define        DEFCOLS 25
 
@@ -214,47 +222,54 @@ maketbl(void)
 {
        TBL *t;
        int coloff, cnt;
-       char *p, **lp;
-       int *lens, maxcols = DEFCOLS;
+       struct li *lp;
+       int *widths, maxcols = DEFCOLS;
        TBL *tbl;
-       char **cols;
+       wchar_t **cols;
+       wchar_t *ws, *wws, *last;
 
        t = tbl = ecalloc(entries, sizeof(TBL));
        cols = ereallocarray(NULL, maxcols, sizeof(char *));
-       lens = ecalloc(maxcols, sizeof(int));
+       widths = ecalloc(maxcols, sizeof(int));
+       if ((ws = reallocarray(NULL, maxwidth, sizeof(*ws))) == NULL)
+               err(1, NULL);
        for (cnt = 0, lp = list; cnt < entries; ++cnt, ++lp, ++t) {
-               for (coloff = 0, p = *lp; (cols[coloff] = strtok(p, separator));
-                   p = NULL)
+               if (mbstowcs(ws, lp->str, maxwidth * sizeof(*ws)) == (size_t)-1)
+                       errx(1, "Invalid char on line %d", cnt+1);
+               free(lp->str);
+               lp->str = NULL;
+               wws = ws;
+               for (coloff = 0; (cols[coloff] = wcstok(wws, separator,
+                   &last)); wws = NULL) {
                        if (++coloff == maxcols) {
                                maxcols += DEFCOLS;
                                cols = ereallocarray(cols, maxcols, 
                                    sizeof(char *));
-                               lens = ereallocarray(lens, maxcols,
+                               widths = ereallocarray(widths, maxcols,
                                    sizeof(int));
-                               memset(lens + coloff, 0, DEFCOLS * sizeof(int));
+                               memset(widths + coloff, 0, DEFCOLS * sizeof(int));
                        }
+               }
                if (coloff == 0)
                        continue;
-               t->list = ecalloc(coloff, sizeof(char *));
-               t->len = ecalloc(coloff, sizeof(int));
+               t->list = ecalloc(coloff, sizeof(*(t->list)));
+               t->width = ecalloc(coloff, sizeof(*(t->width)));
                for (t->cols = coloff; --coloff >= 0;) {
-                       t->list[coloff] = cols[coloff];
-                       t->len[coloff] = strlen(cols[coloff]);
-                       if (t->len[coloff] > lens[coloff])
-                               lens[coloff] = t->len[coloff];
+                       t->list[coloff] = wcsdup(cols[coloff]);
+                       t->width[coloff] = wcswidth(cols[coloff],
+                           wcslen(cols[coloff]));
+                       if (t->width[coloff] > widths[coloff])
+                               widths[coloff] = t->width[coloff];
                }
        }
        for (cnt = 0, t = tbl; cnt < entries; ++cnt, ++t) {
                if (t->cols > 0) {
                        for (coloff = 0; coloff < t->cols - 1; ++coloff)
-                               (void)printf("%s%*s", t->list[coloff],
-                                   lens[coloff] - t->len[coloff] + 2, " ");
-                       (void)printf("%s\n", t->list[coloff]);
+                               (void)printf("%ls%*s", t->list[coloff],
+                                   widths[coloff] - t->width[coloff] + 2, " ");
+                       (void)printf("%ls\n", t->list[coloff]);
                }
        }
-       free(tbl);
-       free(lens);
-       free(cols);
 }
 
 #define        DEFNUM          1000
@@ -263,31 +278,56 @@ maketbl(void)
 void
 input(FILE *fp)
 {
-       static size_t maxentry = DEFNUM;
-       int len;
+       static int maxentry = DEFNUM;
+       int width = 0, cwidth = 0, size;
        char *p, buf[MAXLINELEN];
+       wchar_t wc;
 
        if (!list)
                list = ecalloc(maxentry, sizeof(char *));
        while (fgets(buf, MAXLINELEN, fp)) {
-               for (p = buf; isspace((unsigned char)*p); ++p);
+               width = 0, cwidth = 0;
+               p = buf;
+               do {
+                       p += cwidth;
+                       if ((cwidth = mbtowc(&wc, p, MB_CUR_MAX)) == -1) {
+                               (void) mbtowc(NULL, NULL, MB_CUR_MAX);
+                               break;
+                       }
+               } while (iswspace(wc));
                if (!*p)
                        continue;
-               if (!(p = strchr(p, '\n'))) {
+
+               for (p = buf; *p != '\n' && *p != '\0'; p++) {
+                       if (isu8start(*p)) {
+                               if ((size = mbtowc(&wc, p, MB_CUR_MAX)) == -1)
+                                       mbtowc(NULL, NULL, MB_CUR_MAX);
+                               if ((cwidth = wcwidth(wc)) >= 0) {
+                                       width += cwidth;
+                                       p += size-1;
+                               }
+                       } else {
+                               if (*p == '\t')
+                                       width += TAB - width%TAB;
+                               else if (isprint(*p))
+                                       width++;
+                       }
+               }
+               if (*p != '\n') {
                        warnx("line too long");
                        eval = 1;
                        continue;
                }
                *p = '\0';
-               len = p - buf;
-               if (maxlength < len)
-                       maxlength = len;
+               if (maxwidth < width)
+                       maxwidth = width;
                if (entries == maxentry) {
                        maxentry += DEFNUM;
                        list = ereallocarray(list, maxentry, sizeof(char *));
                        memset(list + entries, 0, DEFNUM * sizeof(char *));
                }
-               if (!(list[entries++] = strdup(buf)))
+               list[entries].width = width;
+               if (!(list[entries++].str = strdup(buf)))
                        err(1, NULL);
        }
 }
@@ -319,4 +359,10 @@ usage(void)
        (void)fprintf(stderr,
            "usage: column [-tx] [-c columns] [-s sep] [file ...]\n");
        exit(1);
+}
+
+int
+isu8start(unsigned char c)
+{
+       return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == (0x80 | 0x40);
 }

Reply via email to