Hi, Christian Weisgerber wrote on Fri, Oct 23, 2015 at 03:52:31PM +0200: > Ted Unangst:
>> I'm very scared to try counting chars vs bytes upfront in such code. Actually, that turns out to be simple. > I think that's insufficient to cover rs's functionality. -z will > overestimate the required widths and... yes, -j is completely broken. So here is a patch getting it completely right, including all options and full support for zero-width and double-width characters, invalid bytes, and non-printable characters. The structure of the code in rs.c is completely untouched, but as a side effect, it got shorter by a handful of lines and i got rid of all instances of "**". As with my ls(1) and ul(1) patches, i have put the function doing the heavy lifting with multibyte characters into a dedicated file utf8.c. Just like in those two cases, that function uses the standard interfaces mbtowc(3) and mbwidth(3). But i'm now keeping two steps seperate: First, add UTF-8 support by adding a file utf8.c using standard interfaces. Later, implement the lookup table to support the standard functions used in these interfaces in a simpler way. I think that way we can actually start committing such patches and improve our userland. Two final notes: 1. It turns out each of the three programs needs exactly one multibyte-character helper function in utf8.c, and each helper function uses mbtowc(3) and mbwidth(3), but all three helper functions have different functionality. So we don't know yet exactly which functionality will be needed most. But that's no problem because these functions are very small can comfortably live in utf8.c. 2. For this particular program, it seems best to me to keep the behaviour for LC_CTYPE=C completely unchanged; no validation of bytes whatsoever. For LC_CTYPE=en_US.UTF-8, however, that's not possible because non-printable Unicode characters have undefined width, and passing through non-printable Unicode characters seems like an even worse idea than passing through non-printable ASCII characters. So, i'm replacing valid, but non-printable Unicode characters with "?", just like i'm replacing invalid bytes with "?". Of course, this can easily be tweaked, for example to pass through non-printable characters and treat them as width 0. OK? Ingo Index: Makefile =================================================================== RCS file: /cvs/src/usr.bin/rs/Makefile,v retrieving revision 1.2 diff -u -p -r1.2 Makefile --- Makefile 26 Jun 1996 05:38:46 -0000 1.2 +++ Makefile 14 Nov 2015 17:34:09 -0000 @@ -1,6 +1,6 @@ # $OpenBSD: Makefile,v 1.2 1996/06/26 05:38:46 deraadt Exp $ - PROG= rs +SRCS= rs.c utf8.c .include <bsd.prog.mk> Index: rs.c =================================================================== RCS file: /cvs/src/usr.bin/rs/rs.c,v retrieving revision 1.29 diff -u -p -r1.29 rs.c --- rs.c 14 Nov 2015 17:03:02 -0000 1.29 +++ rs.c 14 Nov 2015 17:34:09 -0000 @@ -39,11 +39,17 @@ #include <err.h> #include <errno.h> #include <limits.h> +#include <locale.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> +struct cell { + int w; /* Display width. */ + char *s; /* Multibyte string. */ +}; + long flags; #define TRANSPOSE 000001 #define MTRANSPOSE 000002 @@ -63,26 +69,27 @@ long flags; short *colwidths; int nelem; -char **elem; -char **endelem; +struct cell *elem; +struct cell *endelem; char *curline; int allocsize = BUFSIZ; -ssize_t curlen; int irows, icols; int orows, ocols; -ssize_t maxlen; +int maxwidth; int skip; int propgutter; char isep = ' ', osep = ' '; int owidth = 80, gutter = 2; +int mbsavis(char **, const char *); + void usage(void); void getargs(int, char *[]); void getfile(void); int get_line(void); -char **getptrs(char **); +struct cell *getptrs(struct cell *); void prepfile(void); -void prints(char *, int); +void prints(struct cell *, int); void putfile(void); #define INCR(ep) do { \ @@ -93,6 +100,8 @@ void putfile(void); int main(int argc, char *argv[]) { + setlocale(LC_CTYPE, ""); + if (pledge("stdio", NULL) == -1) err(1, "pledge"); @@ -110,13 +119,14 @@ main(int argc, char *argv[]) void getfile(void) { + const char delim[2] = { isep, '\0' }; char *p; - char *endp; - char **ep = NULL; + struct cell *ep; int multisep = (flags & ONEISEPONLY ? 0 : 1); int nullpad = flags & NULLPAD; - char **padto; + struct cell *padto; + curline = NULL; while (skip--) { if (get_line() == EOF) return; @@ -125,67 +135,67 @@ getfile(void) } if (get_line() == EOF) return; - if (flags & NOARGS && curlen < owidth) + if (flags & NOARGS && strlen(curline) < (size_t)owidth) flags |= ONEPERLINE; if (flags & ONEPERLINE) icols = 1; else /* count cols on first line */ - for (p = curline, endp = curline + curlen; p < endp; p++) { + for (p = curline; *p != '\0'; p++) { if (*p == isep && multisep) continue; icols++; while (*p && *p != isep) p++; } - ep = getptrs(elem); + ep = getptrs(NULL); p = curline; do { if (flags & ONEPERLINE) { - *ep = curline; + ep->w = mbsavis(&ep->s, curline); + if (maxwidth < ep->w) + maxwidth = ep->w; INCR(ep); /* prepare for next entry */ - if (maxlen < curlen) - maxlen = curlen; irows++; continue; } - for (p = curline, endp = curline + curlen; p < endp; p++) { - if (*p == isep && multisep) - continue; /* eat up column separators */ - if (*p == isep) /* must be an empty column */ - *ep = ""; - else /* store column entry */ - *ep = p; - while (p < endp && *p != isep) - p++; /* find end of entry */ - *p = '\0'; /* mark end of entry */ - if (maxlen < p - *ep) /* update maxlen */ - maxlen = p - *ep; + p = curline; + while (p != NULL && *p != '\0') { + if (*p == isep) { + p++; + if (multisep) + continue; + ep->s = ""; /* empty column */ + ep->w = 0; + } else + ep->w = mbsavis(&ep->s, strsep(&p, delim)); + if (maxwidth < ep->w) + maxwidth = ep->w; INCR(ep); /* prepare for next entry */ } irows++; /* update row count */ if (nullpad) { /* pad missing entries */ padto = elem + irows * icols; while (ep < padto) { - *ep = ""; + ep->s = ""; + ep->w = 0; INCR(ep); } } } while (get_line() != EOF); - *ep = NULL; /* mark end of pointers */ nelem = ep - elem; } void putfile(void) { - char **ep; + struct cell *ep; int i, j, n; ep = elem; if (flags & TRANSPOSE) { for (i = 0; i < orows; i++) { for (j = i; j < nelem; j += orows) - prints(ep[j], (j - i) / orows); + prints(ep + j, (j - i) / orows); putchar('\n'); } } else { @@ -193,7 +203,7 @@ putfile(void) for (j = 0; j < ocols; j++) { if (n++ >= nelem) break; - prints(*ep++, j); + prints(ep++, j); } putchar('\n'); } @@ -201,19 +211,15 @@ putfile(void) } void -prints(char *s, int col) +prints(struct cell *ep, int col) { int n; - char *p = s; - while (*p) - p++; - n = (flags & ONEOSEPONLY ? 1 : colwidths[col] - (p - s)); + n = (flags & ONEOSEPONLY ? 1 : colwidths[col] - ep->w); if (flags & RIGHTADJUST) while (n-- > 0) putchar(osep); - for (p = s; *p; p++) - putchar(*p); + fputs(ep->s, stdout); while (n-- > 0) putchar(osep); } @@ -232,18 +238,18 @@ usage(void) void prepfile(void) { - char **ep; + struct cell *ep; int i; int j; - char **lp; + struct cell *lp; int colw; int max = 0; int n; if (!nelem) exit(0); - gutter += maxlen * propgutter / 100.0; - colw = maxlen + gutter; + gutter += maxwidth * propgutter / 100.0; + colw = maxwidth + gutter; if (flags & MTRANSPOSE) { orows = icols; ocols = irows; @@ -263,14 +269,11 @@ prepfile(void) orows = nelem / ocols + (nelem % ocols ? 1 : 0); else if (ocols == 0) /* decide on cols */ ocols = nelem / orows + (nelem % orows ? 1 : 0); - lp = elem + orows * ocols; - while (lp > endelem) { - getptrs(elem + nelem); - lp = elem + orows * ocols; - } + while ((lp = elem + orows * ocols) > endelem) + (void)getptrs(NULL); if (flags & RECYCLE) { for (ep = elem + nelem; ep < lp; ep++) - *ep = *(ep - nelem); + memcpy(ep, ep - nelem, sizeof(*ep)); nelem = lp - elem; } if (!(colwidths = calloc(ocols, sizeof(short)))) @@ -279,13 +282,13 @@ prepfile(void) for (ep = elem, i = 0; i < ocols; i++) { max = 0; if (flags & TRANSPOSE) { - for (j = 0; j < orows; j++) - if ((n = strlen(*ep++)) > max) - max = n; + for (j = 0; j < orows; j++, ep++) + if (ep->w > max) + max = ep->w; } else { for (j = i; j < nelem; j += ocols) - if ((n = strlen(ep[j])) > max) - max = n; + if (ep[j].w > max) + max = ep[j].w; } colwidths[i] = max + gutter; } @@ -305,43 +308,38 @@ prepfile(void) } int -get_line(void) /* get line; maintain curline, curlen; manage storage */ +get_line(void) { - static char *ibuf = NULL; - static size_t ibufsz = 0; + static size_t cursz; + static ssize_t curlen; if (irows > 0 && flags & DETAILSHAPE) printf(" %zd line %d\n", curlen, irows); - if ((curlen = getline(&ibuf, &ibufsz, stdin)) == EOF) { + if ((curlen = getline(&curline, &cursz, stdin)) == EOF) { if (ferror(stdin)) err(1, NULL); return EOF; } - if (curlen > 0 && ibuf[curlen - 1] == '\n') - ibuf[--curlen] = '\0'; - - if (skip >= 0 || flags & SHAPEONLY) - curline = ibuf; - else if ((curline = strdup(ibuf)) == NULL) - err(1, NULL); + if (curlen > 0 && curline[curlen - 1] == '\n') + curline[--curlen] = '\0'; return 0; } -char ** -getptrs(char **sp) +struct cell * +getptrs(struct cell *sp) { - char **p; + struct cell *p; int newsize; newsize = allocsize * 2; - p = reallocarray(elem, newsize, sizeof(char *)); + p = reallocarray(elem, newsize, sizeof(*p)); if (p == NULL) err(1, "no memory"); allocsize = newsize; - sp += p - elem; + sp = sp == NULL ? p : p + (sp - elem); elem = p; endelem = elem + allocsize; return(sp); Index: utf8.c =================================================================== RCS file: utf8.c diff -N utf8.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ utf8.c 14 Nov 2015 17:34:09 -0000 @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015 Ingo Schwarze <schwa...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <err.h> +#include <stdlib.h> +#include <string.h> +#include <wchar.h> + +int +mbsavis(char** outp, const char *mbs) +{ + const char *src; /* Iterate mbs. */ + char *dst; /* Iterate *outp. */ + wchar_t wc; + int total_width; /* Display width of the whole string. */ + int width; /* Display width of a single Unicode char. */ + int len; /* Length in bytes of UTF-8 encoded string. */ + + len = strlen(mbs); + if ((*outp = malloc(len + 1)) == NULL) + err(1, NULL); + + if (MB_CUR_MAX == 1) { + memcpy(*outp, mbs, len + 1); + return len; + } + + src = mbs; + dst = *outp; + total_width = 0; + while (*src != '\0') { + if ((len = mbtowc(&wc, src, MB_CUR_MAX)) == -1) { + total_width++; + *dst++ = '?'; + src++; + } else if ((width = wcwidth(wc)) == -1) { + total_width++; + *dst++ = '?'; + src += len; + } else { + total_width += width; + while (len-- > 0) + *dst++ = *src++; + } + } + *dst = '\0'; + return total_width; +}