ping
On 10/24/18 11:34 AM, Martijn van Duren wrote:
> This adds UTF-8 support to join(1). Since we don't support collation, we
> can skip that part of POSIX. What this patch does add is support for a
> multibyte UTF-8 character as the field separator (-t).
>
> Using schwarze@'s favorite UTF-8 character:
> $ cat /tmp/z1
> aßbßc
> $ cat /tmp/z2
> aßdße
> $ ./join -tß /tmp/z1 /tmp/z2
> aßbßcßdße
>
> All regression tests pass, and the patch has been lightly tested.
>
> OK?
>
> martijn@
>
> Index: join.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/join/join.c,v
> retrieving revision 1.30
> diff -u -p -r1.30 join.c
> --- join.c 23 Oct 2018 08:41:45 -0000 1.30
> +++ join.c 24 Oct 2018 09:32:54 -0000
> @@ -34,10 +34,14 @@
> */
>
> #include <err.h>
> +#include <errno.h>
> +#include <limits.h>
> +#include <locale.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> +#include <wchar.h>
>
> #define MAXIMUM(a, b) (((a) > (b)) ? (a) : (b))
>
> @@ -81,11 +85,12 @@ int joinout = 1; /* show lines with mat
> int needsep; /* need separator character */
> int spans = 1; /* span multiple delimiters (-t) */
> char *empty; /* empty field replacement string (-e) */
> -char *tabchar = " \t"; /* delimiter characters (-t) */
> +wchar_t tabchar[] = L" \t"; /* delimiter characters (-t) */
>
> int cmp(LINE *, u_long, LINE *, u_long);
> void fieldarg(char *);
> void joinlines(INPUT *, INPUT *);
> +char *mbssep(char **, const wchar_t *);
> void obsolete(char **);
> void outfield(LINE *, u_long, int);
> void outoneline(INPUT *, LINE *);
> @@ -101,6 +106,8 @@ main(int argc, char *argv[])
> int aflag, ch, cval, vflag;
> char *end;
>
> + setlocale(LC_CTYPE, "");
> +
> if (pledge("stdio rpath", NULL) == -1)
> err(1, "pledge");
>
> @@ -161,8 +168,10 @@ main(int argc, char *argv[])
> break;
> case 't':
> spans = 0;
> - if (strlen(tabchar = optarg) != 1)
> + if (mbtowc(tabchar, optarg, MB_CUR_MAX) !=
> + strlen(optarg))
> errx(1, "illegal tab character specification");
> + tabchar[1] = L'\0';
> break;
> case 'v':
> vflag = 1;
> @@ -333,7 +342,7 @@ slurp(INPUT *F)
> /* Split the line into fields, allocate space as necessary. */
> lp->fieldcnt = 0;
> bp = lp->line;
> - while ((fieldp = strsep(&bp, tabchar)) != NULL) {
> + while ((fieldp = mbssep(&bp, tabchar)) != NULL) {
> if (spans && *fieldp == '\0')
> continue;
> if (lp->fieldcnt == lp->fieldalloc) {
> @@ -358,6 +367,36 @@ slurp(INPUT *F)
> free(line);
> }
>
> +char * /* strsep(3)-like splitter whose delimiters are wide characters */
> +mbssep(char **stringp, const wchar_t *wcdelim)
> +{
> + char *s, *p;
> + size_t ndelim;
> + int i;
> + /* the visible caller passes tabchar, which never holds more than 2 delimiters */
> + char mbdelim[2][MB_LEN_MAX + 1];
> + size_t mblen[2]; /* NOTE(review): shadows mblen(3) inside this function */
> +
> + if ((s = *stringp) == NULL)
> + return NULL;
> + ndelim = wcslen(wcdelim);
> + for (i = 0; i < ndelim; i++) { /* convert each delimiter to its multibyte form */
> + if ((mblen[i] = wctomb(mbdelim[i], wcdelim[i])) == -1)
> + errc(1, EILSEQ, "wctomb");
> + }
> + for (p = s; *p != '\0'; p++) { /* byte-wise scan for any delimiter */
> + for (i = 0; i < ndelim; i++) {
> + if (strncmp(p, mbdelim[i], mblen[i]) == 0) {
> + *p = '\0'; /* terminate the field in place */
> + *stringp = p + mblen[i]; /* resume after the whole delimiter */
> + return s;
> + }
> + }
> + }
> + *stringp = NULL; /* no delimiter found: this is the last field */
> + return s;
> +}
> +
> int
> cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
> {
> @@ -463,7 +502,7 @@ void
> outfield(LINE *lp, u_long fieldno, int out_empty)
> {
> if (needsep++)
> - putchar((int)*tabchar);
> + putwchar(*tabchar);
> if (!ferror(stdout)) {
> if (lp->fieldcnt <= fieldno || out_empty) {
> if (empty != NULL)
>