Re: join(1) add UTF-8 support

Martijn van Duren Mon, 05 Nov 2018 23:22:37 -0800
ping

On 10/24/18 11:34 AM, Martijn van Duren wrote:
> This adds UTF-8 support for join(1). Since we don't support collation, we
> can skip that part of POSIX. What this patch does add is support for
> splitting columns on a multibyte UTF-8 delimiter character (-t).
> 
> Using schwarze@'s favorite UTF-8 character:
> $ cat /tmp/z1 
> aßbßc
> $ cat /tmp/z2 
> aßdße
> $ ./join -tß /tmp/z1 /tmp/z2
> aßbßcßdße
> 
> All regression tests pass; beyond that, the patch is only lightly tested.
> 
> OK?
> 
> martijn@
> 
> Index: join.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/join/join.c,v
> retrieving revision 1.30
> diff -u -p -r1.30 join.c
> --- join.c    23 Oct 2018 08:41:45 -0000      1.30
> +++ join.c    24 Oct 2018 09:32:54 -0000
> @@ -34,10 +34,14 @@
>   */
>  
>  #include <err.h>
> +#include <errno.h>
> +#include <limits.h>
> +#include <locale.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <unistd.h>
> +#include <wchar.h>
>  
>  #define MAXIMUM(a, b)        (((a) > (b)) ? (a) : (b))
>  
> @@ -81,11 +85,12 @@ int joinout = 1;          /* show lines with mat
>  int needsep;                 /* need separator character */
>  int spans = 1;                       /* span multiple delimiters (-t) */
>  char *empty;                 /* empty field replacement string (-e) */
> -char *tabchar = " \t";               /* delimiter characters (-t) */
> +wchar_t tabchar[] = L" \t";  /* delimiter characters (-t) */
>  
>  int  cmp(LINE *, u_long, LINE *, u_long);
>  void fieldarg(char *);
>  void joinlines(INPUT *, INPUT *);
> +char *mbssep(char **, const wchar_t *);
>  void obsolete(char **);
>  void outfield(LINE *, u_long, int);
>  void outoneline(INPUT *, LINE *);
> @@ -101,6 +106,8 @@ main(int argc, char *argv[])
>       int aflag, ch, cval, vflag;
>       char *end;
>  
> +     setlocale(LC_CTYPE, "");
> +
>       if (pledge("stdio rpath", NULL) == -1)
>               err(1, "pledge");
>  
> @@ -161,8 +168,10 @@ main(int argc, char *argv[])
>                       break;
>               case 't':
>                       spans = 0;
> -                     if (strlen(tabchar = optarg) != 1)
> +                     if (mbtowc(tabchar, optarg, MB_CUR_MAX) !=
> +                         strlen(optarg))
>                               errx(1, "illegal tab character specification");
> +                     tabchar[1] = L'\0';
>                       break;
>               case 'v':
>                       vflag = 1;
> @@ -333,7 +342,7 @@ slurp(INPUT *F)
>               /* Split the line into fields, allocate space as necessary. */
>               lp->fieldcnt = 0;
>               bp = lp->line;
> -             while ((fieldp = strsep(&bp, tabchar)) != NULL) {
> +             while ((fieldp = mbssep(&bp, tabchar)) != NULL) {
>                       if (spans && *fieldp == '\0')
>                               continue;
>                       if (lp->fieldcnt == lp->fieldalloc) {
> @@ -358,6 +367,36 @@ slurp(INPUT *F)
>       free(line);
>  }
>  
> +char *
> +mbssep(char **stringp, const wchar_t *wcdelim)
> +{
> +     char *s, *p;
> +     size_t ndelim;
> +     int i;
> +     /* tabchar is never more than 2 */
> +     char mbdelim[2][MB_LEN_MAX + 1];
> +     size_t mblen[2];
> +
> +     if ((s = *stringp) == NULL)
> +             return NULL;
> +     ndelim = wcslen(wcdelim);
> +     for (i = 0; i < ndelim; i++) {
> +             if ((mblen[i] = wctomb(mbdelim[i], wcdelim[i])) == -1)
> +                     errc(1, EILSEQ, "wctomb");
> +     }
> +     for (p = s; *p != '\0'; p++) {
> +             for (i = 0; i < ndelim; i++) {
> +                     if (strncmp(p, mbdelim[i], mblen[i]) == 0) {
> +                             *p = '\0';
> +                             *stringp = p + mblen[i];
> +                             return s;
> +                     }
> +             }
> +     }
> +     *stringp = NULL;
> +     return s;
> +}
> +
>  int
>  cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
>  {
> @@ -463,7 +502,7 @@ void
>  outfield(LINE *lp, u_long fieldno, int out_empty)
>  {
>       if (needsep++)
> -             putchar((int)*tabchar);
> +             putwchar(*tabchar);
>       if (!ferror(stdout)) {
>               if (lp->fieldcnt <= fieldno || out_empty) {
>                       if (empty != NULL)
>
Re: join(1) add UTF-8 support

Reply via email to