join(1) add UTF-8 support

Martijn van Duren Wed, 24 Oct 2018 02:35:03 -0700

This adds UTF-8 support for join(1). Since we don't support collation, we
can skip that part of POSIX. What this patch does add is support for
splitting columns on a UTF-8 (multibyte) separator character.


Using schwarze@'s favorite UTF-8 character:
$ cat /tmp/z1 
aßbßc
$ cat /tmp/z2 
aßdße
$ ./join -tß /tmp/z1 /tmp/z2
aßbßcßdße

All regression tests pass; beyond that, the patch is only lightly tested.

OK?

martijn@

Index: join.c
===================================================================
RCS file: /cvs/src/usr.bin/join/join.c,v
retrieving revision 1.30
diff -u -p -r1.30 join.c
--- join.c      23 Oct 2018 08:41:45 -0000      1.30
+++ join.c      24 Oct 2018 09:32:54 -0000
@@ -34,10 +34,14 @@
  */
 
 #include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
 
 #define MAXIMUM(a, b)  (((a) > (b)) ? (a) : (b))
 
@@ -81,11 +85,12 @@ int joinout = 1;            /* show lines with mat
 int needsep;                   /* need separator character */
 int spans = 1;                 /* span multiple delimiters (-t) */
 char *empty;                   /* empty field replacement string (-e) */
-char *tabchar = " \t";         /* delimiter characters (-t) */
+wchar_t tabchar[] = L" \t";    /* delimiter characters (-t) */
 
 int  cmp(LINE *, u_long, LINE *, u_long);
 void fieldarg(char *);
 void joinlines(INPUT *, INPUT *);
+char *mbssep(char **, const wchar_t *);
 void obsolete(char **);
 void outfield(LINE *, u_long, int);
 void outoneline(INPUT *, LINE *);
@@ -101,6 +106,8 @@ main(int argc, char *argv[])
        int aflag, ch, cval, vflag;
        char *end;
 
+       setlocale(LC_CTYPE, "");
+
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
 
@@ -161,8 +168,10 @@ main(int argc, char *argv[])
                        break;
                case 't':
                        spans = 0;
-                       if (strlen(tabchar = optarg) != 1)
+                       if (mbtowc(tabchar, optarg, MB_CUR_MAX) !=
+                           strlen(optarg))
                                errx(1, "illegal tab character specification");
+                       tabchar[1] = L'\0';
                        break;
                case 'v':
                        vflag = 1;
@@ -333,7 +342,7 @@ slurp(INPUT *F)
                /* Split the line into fields, allocate space as necessary. */
                lp->fieldcnt = 0;
                bp = lp->line;
-               while ((fieldp = strsep(&bp, tabchar)) != NULL) {
+               while ((fieldp = mbssep(&bp, tabchar)) != NULL) {
                        if (spans && *fieldp == '\0')
                                continue;
                        if (lp->fieldcnt == lp->fieldalloc) {
@@ -358,6 +367,36 @@ slurp(INPUT *F)
        free(line);
 }
 
+char *
+mbssep(char **stringp, const wchar_t *wcdelim) /* strsep(3)-style split; delimiters given as wide chars */
+{
+       char *s, *p;
+       size_t ndelim;
+       int i;
+       /* 2 slots suffice for tabchar as passed by slurp(): L" \t" by default, one character after -t */
+       char mbdelim[2][MB_LEN_MAX + 1];
+       size_t mblen[2];        /* byte length of each mbdelim[i]; NOTE(review): name shadows mblen(3) */
+
+       if ((s = *stringp) == NULL)
+               return NULL;    /* nothing left to split */
+       ndelim = wcslen(wcdelim);
+       for (i = 0; i < ndelim; i++) {  /* convert every delimiter to its multibyte sequence */
+               if ((mblen[i] = wctomb(mbdelim[i], wcdelim[i])) == -1)
+                       errc(1, EILSEQ, "wctomb");
+       }
+       for (p = s; *p != '\0'; p++) {  /* advance one byte at a time looking for any delimiter */
+               for (i = 0; i < ndelim; i++) {
+                       if (strncmp(p, mbdelim[i], mblen[i]) == 0) {
+                               *p = '\0';      /* terminate the current field in place */
+                               *stringp = p + mblen[i];        /* next call resumes after the delimiter bytes */
+                               return s;
+                       }
+               }
+       }
+       *stringp = NULL;        /* no delimiter found: s is the last field */
+       return s;
+}
+
 int
 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
 {
@@ -463,7 +502,7 @@ void
 outfield(LINE *lp, u_long fieldno, int out_empty)
 {
        if (needsep++)
-               putchar((int)*tabchar);
+               putwchar(*tabchar);
        if (!ferror(stdout)) {
                if (lp->fieldcnt <= fieldno || out_empty) {
                        if (empty != NULL)

join(1) add UTF-8 support

Reply via email to