This adds UTF-8 support to join(1). Since we don't support collation, we
can skip that part of POSIX. What this patch does add is support for
splitting fields on a multibyte (UTF-8) separator character given with -t.
Using schwarze@'s favorite UTF-8 character:
$ cat /tmp/z1
aßbßc
$ cat /tmp/z2
aßdße
$ ./join -tß /tmp/z1 /tmp/z2
aßbßcßdße
All regression tests pass; beyond that it is only lightly tested.
OK?
martijn@
Index: join.c
===================================================================
RCS file: /cvs/src/usr.bin/join/join.c,v
retrieving revision 1.30
diff -u -p -r1.30 join.c
--- join.c 23 Oct 2018 08:41:45 -0000 1.30
+++ join.c 24 Oct 2018 09:32:54 -0000
@@ -34,10 +34,14 @@
*/
#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
#define MAXIMUM(a, b) (((a) > (b)) ? (a) : (b))
@@ -81,11 +85,12 @@ int joinout = 1; /* show lines with mat
int needsep; /* need separator character */
int spans = 1; /* span multiple delimiters (-t) */
char *empty; /* empty field replacement string (-e) */
-char *tabchar = " \t"; /* delimiter characters (-t) */
+wchar_t tabchar[] = L" \t"; /* delimiter characters (-t) */
int cmp(LINE *, u_long, LINE *, u_long);
void fieldarg(char *);
void joinlines(INPUT *, INPUT *);
+char *mbssep(char **, const wchar_t *);
void obsolete(char **);
void outfield(LINE *, u_long, int);
void outoneline(INPUT *, LINE *);
@@ -101,6 +106,8 @@ main(int argc, char *argv[])
int aflag, ch, cval, vflag;
char *end;
+ setlocale(LC_CTYPE, "");
+
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
@@ -161,8 +168,10 @@ main(int argc, char *argv[])
break;
case 't':
spans = 0;
- if (strlen(tabchar = optarg) != 1)
+ if (mbtowc(tabchar, optarg, MB_CUR_MAX) !=
+ strlen(optarg))
errx(1, "illegal tab character specification");
+ tabchar[1] = L'\0';
break;
case 'v':
vflag = 1;
@@ -333,7 +342,7 @@ slurp(INPUT *F)
/* Split the line into fields, allocate space as necessary. */
lp->fieldcnt = 0;
bp = lp->line;
- while ((fieldp = strsep(&bp, tabchar)) != NULL) {
+ while ((fieldp = mbssep(&bp, tabchar)) != NULL) {
if (spans && *fieldp == '\0')
continue;
if (lp->fieldcnt == lp->fieldalloc) {
@@ -358,6 +367,36 @@ slurp(INPUT *F)
free(line);
}
+/*
+ * Multibyte-aware counterpart to strsep(3): split off the next field of
+ * *stringp at the first occurrence of any delimiter listed in wcdelim.
+ *
+ * Each wide character in wcdelim is converted to its multibyte encoding
+ * with wctomb(3) and the string is searched for those byte sequences.
+ * The first byte of the matched delimiter is overwritten with '\0' and
+ * *stringp is advanced past the whole delimiter; when no delimiter is
+ * found, *stringp is set to NULL.
+ *
+ * Returns the start of the field, or NULL if *stringp was already NULL.
+ * Exits with status 1 if a delimiter cannot be encoded or if wcdelim
+ * holds more delimiters than the conversion buffers can take.
+ */
+char *
+mbssep(char **stringp, const wchar_t *wcdelim)
+{
+	/* tabchar is either the default " \t" or a single -t character */
+	enum { MAXDELIM = 2 };
+	char mbdelim[MAXDELIM][MB_LEN_MAX + 1];
+	size_t delimlen[MAXDELIM];
+	size_t i, ndelim;
+	char *s, *p;
+	int n;
+
+	if ((s = *stringp) == NULL)
+		return NULL;
+
+	/* The conversion buffers are fixed size; refuse to overrun them. */
+	ndelim = wcslen(wcdelim);
+	if (ndelim > MAXDELIM)
+		errx(1, "too many field delimiters");
+
+	for (i = 0; i < ndelim; i++) {
+		/* Check the int result before it is widened to size_t. */
+		if ((n = wctomb(mbdelim[i], wcdelim[i])) == -1)
+			errc(1, EILSEQ, "wctomb");
+		delimlen[i] = (size_t)n;
+	}
+
+	for (p = s; *p != '\0'; p++) {
+		for (i = 0; i < ndelim; i++) {
+			/*
+			 * strncmp() rather than memcmp(): it stops at the
+			 * input's terminating NUL, so a delimiter longer
+			 * than the remaining text cannot read past the end.
+			 */
+			if (strncmp(p, mbdelim[i], delimlen[i]) == 0) {
+				*p = '\0';
+				*stringp = p + delimlen[i];
+				return s;
+			}
+		}
+	}
+	*stringp = NULL;
+	return s;
+}
+
int
cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
{
@@ -463,7 +502,7 @@ void
outfield(LINE *lp, u_long fieldno, int out_empty)
{
if (needsep++)
- putchar((int)*tabchar);
+ putwchar(*tabchar);
if (!ferror(stdout)) {
if (lp->fieldcnt <= fieldno || out_empty) {
if (empty != NULL)