Hello tech@,
Here's my attempt to implement UTF-8 support in column(1).
Besides the general UTF-8 conversions, it does several other things to
make it behave properly. The full changelist is as follows:
- Make the separator and input fully UTF-8 aware.
- Count character widths properly. This also fixes some indentation
issues where the old code assumed that a tab was also one column wide.
- Replace tabs between columns with spaces. The old code worked fine,
but with UTF-8 and oxtabs enabled in stty, the column positioning can get
way off, and we can't expect everyone to run with "stty -oxtabs". Found
with the help of nicm@.
OK?
martijn@
Index: column.c
===================================================================
RCS file: /cvs/src/usr.bin/column/column.c,v
retrieving revision 1.22
diff -u -p -r1.22 column.c
--- column.c 3 Nov 2015 04:55:44 -0000 1.22
+++ column.c 27 Feb 2016 12:21:35 -0000
@@ -36,15 +36,24 @@
#include <ctype.h>
#include <err.h>
#include <limits.h>
+#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
+
+struct li {
+ int width;
+ char *str;
+};
void c_columnate(void);
void *ereallocarray(void *, size_t, size_t);
void *ecalloc(size_t, size_t);
void input(FILE *);
+int isu8start(unsigned char);
void maketbl(void);
void print(void);
void r_columnate(void);
@@ -54,9 +63,9 @@ int termwidth = 80; /* default terminal
int entries; /* number of records */
int eval; /* exit value */
-int maxlength; /* longest record */
-char **list; /* array of pointers to records */
-char *separator = "\t "; /* field separator for table option */
+int maxwidth; /* longest record */
+struct li *list; /* array of records */
+wchar_t *separator = L"\t "; /* field separator for table option */
int
main(int argc, char *argv[])
@@ -66,6 +75,7 @@ main(int argc, char *argv[])
int ch, tflag, xflag;
char *p;
const char *errstr;
+ int slen;
if (ioctl(1, TIOCGWINSZ, &win) == -1 || !win.ws_col) {
if ((p = getenv("COLUMNS")) && *p != '\0') {
@@ -79,6 +89,8 @@ main(int argc, char *argv[])
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
+ setlocale(LC_CTYPE, "");
+
tflag = xflag = 0;
while ((ch = getopt(argc, argv, "c:s:tx")) != -1)
switch(ch) {
@@ -88,7 +100,13 @@ main(int argc, char *argv[])
errx(1, "%s: %s", errstr, optarg);
break;
case 's':
- separator = optarg;
+ slen = strlen(optarg)+1;
+ if ((separator = reallocarray(NULL, slen,
+ sizeof(*separator))) == NULL)
+ err(1, NULL);
+ if (mbstowcs(separator, optarg,
+ slen * sizeof(*separator)) == (size_t)-1)
+ err(1, "Unable to set separator");
break;
case 't':
tflag = 1;
@@ -125,69 +143,59 @@ main(int argc, char *argv[])
if (tflag)
maketbl();
- else if (maxlength >= termwidth)
+ else if (maxwidth >= termwidth)
print();
else if (xflag)
c_columnate();
else
r_columnate();
- exit(eval);
+ return eval;
}
#define TAB 8
void
c_columnate(void)
{
- int chcnt, col, cnt, endcol, numcols;
- char **lp;
+ int col, numcols;
+ struct li *lp;
- maxlength = (maxlength + TAB) & ~(TAB - 1);
- numcols = termwidth / maxlength;
- endcol = maxlength;
- for (chcnt = col = 0, lp = list;; ++lp) {
- chcnt += printf("%s", *lp);
+ maxwidth = (maxwidth + TAB) & ~(TAB - 1);
+ if ((numcols = termwidth / maxwidth) == 0)
+ numcols = 1;
+ for (col = 0, lp = list;; ++lp) {
+ printf("%s", lp->str);
if (!--entries)
break;
if (++col == numcols) {
- chcnt = col = 0;
- endcol = maxlength;
+ col = 0;
putchar('\n');
} else {
- while ((cnt = ((chcnt + TAB) & ~(TAB - 1))) <= endcol) {
- (void)putchar('\t');
- chcnt = cnt;
- }
- endcol += maxlength;
+ while (lp->width++ < maxwidth)
+ (void)putchar(' ');
}
}
- if (chcnt)
- putchar('\n');
+ putchar('\n');
}
void
r_columnate(void)
{
- int base, chcnt, cnt, col, endcol, numcols, numrows, row;
+ int base, col, numcols, numrows, row;
- maxlength = (maxlength + TAB) & ~(TAB - 1);
- numcols = termwidth / maxlength;
- if (numcols == 0)
+ maxwidth = (maxwidth + TAB) & ~(TAB - 1);
+ if ((numcols = termwidth / maxwidth) == 0)
numcols = 1;
numrows = entries / numcols;
if (entries % numcols)
++numrows;
for (row = 0; row < numrows; ++row) {
- endcol = maxlength;
- for (base = row, chcnt = col = 0; col < numcols; ++col) {
- chcnt += printf("%s", list[base]);
+ for (base = row, col = 0; col < numcols; ++col) {
+ printf("%s", list[base].str);
+ while (list[base].width++ < maxwidth)
+ (void)putchar(' ');
if ((base += numrows) >= entries)
break;
- while ((cnt = ((chcnt + TAB) & ~(TAB - 1))) <= endcol) {
- (void)putchar('\t');
- chcnt = cnt;
- }
- endcol += maxlength;
}
putchar('\n');
}
@@ -197,15 +205,15 @@ void
print(void)
{
int cnt;
- char **lp;
+ struct li *lp;
for (cnt = entries, lp = list; cnt--; ++lp)
- (void)printf("%s\n", *lp);
+ (void)printf("%s\n", lp->str);
}
typedef struct _tbl {
- char **list;
- int cols, *len;
+ wchar_t **list;
+ int cols, *width;
} TBL;
#define DEFCOLS 25
@@ -214,47 +222,54 @@ maketbl(void)
{
TBL *t;
int coloff, cnt;
- char *p, **lp;
- int *lens, maxcols = DEFCOLS;
+ struct li *lp;
+ int *widths, maxcols = DEFCOLS;
TBL *tbl;
- char **cols;
+ wchar_t **cols;
+ wchar_t *ws, *wws, *last;
t = tbl = ecalloc(entries, sizeof(TBL));
cols = ereallocarray(NULL, maxcols, sizeof(char *));
- lens = ecalloc(maxcols, sizeof(int));
+ widths = ecalloc(maxcols, sizeof(int));
+ if ((ws = reallocarray(NULL, maxwidth, sizeof(*ws))) == NULL)
+ err(1, NULL);
for (cnt = 0, lp = list; cnt < entries; ++cnt, ++lp, ++t) {
- for (coloff = 0, p = *lp; (cols[coloff] = strtok(p, separator));
- p = NULL)
+ if (mbstowcs(ws, lp->str, maxwidth * sizeof(*ws)) == (size_t)-1)
+ errx(1, "Invalid char on line %d", cnt+1);
+ free(lp->str);
+ lp->str = NULL;
+ wws = ws;
+ for (coloff = 0; (cols[coloff] = wcstok(wws, separator,
+ &last)); wws = NULL) {
if (++coloff == maxcols) {
maxcols += DEFCOLS;
cols = ereallocarray(cols, maxcols,
sizeof(char *));
- lens = ereallocarray(lens, maxcols,
+ widths = ereallocarray(widths, maxcols,
sizeof(int));
- memset(lens + coloff, 0, DEFCOLS * sizeof(int));
+ memset(widths + coloff, 0, DEFCOLS *
sizeof(int));
}
+ }
if (coloff == 0)
continue;
- t->list = ecalloc(coloff, sizeof(char *));
- t->len = ecalloc(coloff, sizeof(int));
+ t->list = ecalloc(coloff, sizeof(*(t->list)));
+ t->width = ecalloc(coloff, sizeof(*(t->width)));
for (t->cols = coloff; --coloff >= 0;) {
- t->list[coloff] = cols[coloff];
- t->len[coloff] = strlen(cols[coloff]);
- if (t->len[coloff] > lens[coloff])
- lens[coloff] = t->len[coloff];
+ t->list[coloff] = wcsdup(cols[coloff]);
+ t->width[coloff] = wcswidth(cols[coloff],
+ wcslen(cols[coloff]));
+ if (t->width[coloff] > widths[coloff])
+ widths[coloff] = t->width[coloff];
}
}
for (cnt = 0, t = tbl; cnt < entries; ++cnt, ++t) {
if (t->cols > 0) {
for (coloff = 0; coloff < t->cols - 1; ++coloff)
- (void)printf("%s%*s", t->list[coloff],
- lens[coloff] - t->len[coloff] + 2, " ");
- (void)printf("%s\n", t->list[coloff]);
+ (void)printf("%ls%*s", t->list[coloff],
+ widths[coloff] - t->width[coloff] + 2, " ");
+ (void)printf("%ls\n", t->list[coloff]);
}
}
- free(tbl);
- free(lens);
- free(cols);
}
#define DEFNUM 1000
@@ -263,31 +278,56 @@ maketbl(void)
void
input(FILE *fp)
{
- static size_t maxentry = DEFNUM;
- int len;
+ static int maxentry = DEFNUM;
+ int width = 0, cwidth = 0, size;
char *p, buf[MAXLINELEN];
+ wchar_t wc;
if (!list)
list = ecalloc(maxentry, sizeof(char *));
while (fgets(buf, MAXLINELEN, fp)) {
- for (p = buf; isspace((unsigned char)*p); ++p);
+ width = 0, cwidth = 0;
+ p = buf;
+ do {
+ p += cwidth;
+ if ((cwidth = mbtowc(&wc, p, MB_CUR_MAX)) == -1) {
+ (void) mbtowc(NULL, NULL, MB_CUR_MAX);
+ break;
+ }
+ } while (iswspace(wc));
if (!*p)
continue;
- if (!(p = strchr(p, '\n'))) {
+
+ for (p = buf; *p != '\n' && *p != '\0'; p++) {
+ if (isu8start(*p)) {
+ if ((size = mbtowc(&wc, p, MB_CUR_MAX)) == -1)
+ mbtowc(NULL, NULL, MB_CUR_MAX);
+ if ((cwidth = wcwidth(wc)) >= 0) {
+ width += cwidth;
+ p += size-1;
+ }
+ } else {
+ if (*p == '\t')
+ width += TAB - width%TAB;
+ else if (isprint(*p))
+ width++;
+ }
+ }
+ if (*p != '\n') {
warnx("line too long");
eval = 1;
continue;
}
*p = '\0';
- len = p - buf;
- if (maxlength < len)
- maxlength = len;
+ if (maxwidth < width)
+ maxwidth = width;
if (entries == maxentry) {
maxentry += DEFNUM;
list = ereallocarray(list, maxentry, sizeof(char *));
memset(list + entries, 0, DEFNUM * sizeof(char *));
}
- if (!(list[entries++] = strdup(buf)))
+ list[entries].width = width;
+ if (!(list[entries++].str = strdup(buf)))
err(1, NULL);
}
}
@@ -319,4 +359,10 @@ usage(void)
(void)fprintf(stderr,
"usage: column [-tx] [-c columns] [-s sep] [file ...]\n");
exit(1);
+}
+
+int
+isu8start(unsigned char c)
+{
+ return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == (0x80 | 0x40);
}