commit cd0b771cbbb56096335c1f2d8a0c953d46b0d430
Author: Wolfgang Corcoran-Mathe <[email protected]>
Date:   Mon Apr 20 11:23:20 2015 +0100

    Add join(1)

diff --git a/LICENSE b/LICENSE
index b1baebc..5ebeb0c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -58,3 +58,4 @@ Authors/contributors include:
 © 2015 Tai Chi Minh Ralph Eastwood <[email protected]>
 © 2015 Quentin Rameau <[email protected]>
 © 2015 Dionysis Grigoropoulos <[email protected]>
+© 2015 Wolfgang Corcoran-Mathe <[email protected]>
diff --git a/Makefile b/Makefile
index d782193..ffa8678 100644
--- a/Makefile
+++ b/Makefile
@@ -99,6 +99,7 @@ BIN =\
        fold\
        grep\
        head\
+       join\
        hostname\
        kill\
        link\
diff --git a/README b/README
index cdc2aaa..2009e49 100644
--- a/README
+++ b/README
@@ -40,6 +40,7 @@ The following tools are implemented:
 =* o grep        .
 =*|o head        .
 =*|x hostname    .
+=* o join        .
 =*|o kill        .
 =*|o link        .
 =*|o ln          .
diff --git a/TODO b/TODO
index e0055c7..2267def 100644
--- a/TODO
+++ b/TODO
@@ -10,7 +10,6 @@ diff
 ed
 getconf
 install
-join
 od
 patch
 pathchk
diff --git a/join.1 b/join.1
new file mode 100644
index 0000000..66d782f
--- /dev/null
+++ b/join.1
@@ -0,0 +1,105 @@
+.Dd April 18, 2015
+.Dt JOIN 1
+.Os sbase
+.Sh NAME
+.Nm join
+.Nd relational database operator
+.Sh SYNOPSIS
+.Nm
+.Op Fl 1 Ar field
+.Op Fl 2 Ar field
+.Op Fl o Ar list
+.Op Fl e Ar string
+.Op Fl a Ar fileno | Fl v Ar fileno
+.Op Fl t Ar delim
+.Ar file1 file2
+.Sh DESCRIPTION
+.Nm
+lines from
+.Ar file1
+and
+.Ar file2
+on a matching field. If one of the input files is '-', standard input
+is read for that file.
+.Pp
+Files are read sequentially and are assumed to be sorted on the join
+field.
+.Nm
+does not check the order of input, and joining two unsorted files will
+produce unexpected output.
+.Pp
+By default, input lines are matched on the first blank-separated
+field; output lines are space-separated and consist of the join field
+followed by the remaining fields from
+.Ar file1 Ns ,
+then the remaining fields from
+.Ar file2 Ns .
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl 1 Ar field
+Join on the
+.Ar field Ns th
+field of file 1.
+.It Fl 2 Ar field
+Join on the
+.Ar field Ns th
+field of file 2.
+.It Fl a Ar fileno
+Print unpairable lines from file
+.Ar fileno
+in addition to normal output.
+.It Fl e Ar string
+When used with
+.Fl o Ns ,
+replace empty fields in the output list with
+.Ar string Ns .
+.It Fl o Ar list
+Format output according to the string
+.Ar list Ns .
+Each element of
+.Ar list
+may be either
+.Ar fileno.field
+or 0 (representing the join field).
+Elements in
+.Ar list
+may be separated by blanks or commas. For example,
+.Bd -literal -offset indent
+join -o "0 2.1 1.3"
+.Ed
+.Pp
+would print the join field, the first field of
+.Ar file2 Ns ,
+then the third field of
+.Ar file1 Ns .
+.Pp
+Only paired lines are formatted with the
+.Fl o
+option. Unpairable lines (selected with
+.Fl a
+or
+.Fl v Ns )
+are printed raw.
+.It Fl t Ar delim
+Use the arbitrary string
+.Ar delim
+as field delimiter for both input and output.
+.It Fl v Ar fileno
+Print unpairable lines from file
+.Ar fileno
+instead of normal output.
+.El
+.Sh STANDARDS
+The
+.Nm
+utility is compliant with the
+.St -p1003.1-2013
+specification with the following exception:
+.Bl -bullet -offset indent
+.It
+Unpairable lines ignore formatting specified with
+.Fl o Ns .
+.El
+.Pp
+The possibility of specifying multibyte delimiters of arbitrary
+length is an extension to the specification.
diff --git a/join.c b/join.c
new file mode 100644
index 0000000..ab6fff3
--- /dev/null
+++ b/join.c
@@ -0,0 +1,554 @@
+#include <ctype.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "arg.h"
+#include "text.h"
+#include "utf.h"
+#include "util.h"
+
+enum {                                  /* sizing of the growable arrays */
+       INIT = 1,                        /* element count used for the first allocation */
+       GROW = 2,                        /* capacity multiplier applied when an array is full */
+};
+
+enum {                                  /* values passed as addtospan()'s reset argument */
+       EXPAND = 0,                      /* append the new line after the existing ones */
+       RESET  = 1,                      /* set the span's line count to 0 before storing */
+};
+
+enum { FIELD_ERROR = -2, };             /* linecmp() result: both lines lack the join field */
+
+struct field {                          /* one field of an input line */
+       char *s;                         /* start of the field inside the line's text */
+       size_t len;                      /* field length in bytes; printed with fwrite(), not as a C string */
+};
+
+struct line {                           /* one input line split into fields */
+       char *text;                      /* line buffer obtained from getline() */
+       size_t nf;                       /* number of fields in use */
+       size_t maxf;                     /* allocated capacity of fields[] */
+       struct field *fields;
+};
+
+struct spec {                           /* one element of the -o output list */
+       size_t fileno;                   /* 0 = join field, 1 = file1, 2 = file2 */
+       size_t fldno;                    /* zero-based field index (makespec() decrements the user's value) */
+};
+
+struct outlist {                        /* growable array of output specs */
+       size_t ns;                       /* number of specs in use */
+       size_t maxs;                     /* allocated capacity of specs[] */
+       struct spec **specs;
+};
+
+struct span {                           /* growable array of lines read from one file */
+       size_t nl;                       /* number of lines in use */
+       size_t maxl;                     /* allocated capacity of lines[] */
+       struct line **lines;
+};
+
+static char *sep = NULL;                /* -t delimiter; NULL selects blank-separated input */
+static char *replace = NULL;            /* -e string printed for missing -o fields */
+static const char defaultofs = ' ';     /* output separator when -t is not given */
+static const int jfield = 1;            /* POSIX default join field */
+static int unpairsa = 0, unpairsb = 0;  /* print unpairable lines of file1 / file2 (-a, -v) */
+static int oflag = 0;                   /* set when -o was given */
+static int pairs = 1;                   /* cleared by -v: suppress joined output */
+static size_t seplen;                   /* length of sep as returned by unescape() */
+static struct outlist output;           /* parsed -o list */
+
+char *argv0;                            /* referenced by usage(); NOTE(review): presumably set by ARGBEGIN - confirm in arg.h */
+
+
+/* Report the command synopsis through eprintf(). */
+static void
+usage(void)
+{
+       eprintf("usage: %s [-1 field] [-2 field] "
+               "[-o list] [-e string] "
+               "[-a | -v fileno] [-t delim] "
+               "file1 file2\n",
+               argv0);
+}
+
+/*
+ * Write the fp->len bytes of field fp to stdout; a short write is
+ * reported through eprintf().
+ */
+static void
+prfield(struct field *fp)
+{
+       size_t written;
+
+       written = fwrite(fp->s, 1, fp->len, stdout);
+       if (written != fp->len)
+               eprintf("fwrite:");
+}
+
+/*
+ * Write the output field separator: the -t delimiter when one was
+ * given, otherwise the default single space.
+ */
+static void
+prsep(void)
+{
+       if (!sep) {
+               putchar(defaultofs);
+               return;
+       }
+
+       fwrite(sep, 1, seplen, stdout);
+}
+
+/* Exchange the contents of the two line structures (shallow copy). */
+static void
+swaplines(struct line *la, struct line *lb)
+{
+       struct line saved = *la;
+
+       *la = *lb;
+       *lb = saved;
+}
+
+/*
+ * Print one joined output line for the pair (la, lb).
+ *
+ * jfa and jfb are the zero-based join field indices for la and lb.
+ * Nothing is printed when either line lacks its join field.
+ *
+ * With -o, each spec of the output list is printed in order; a spec
+ * naming a field the line does not have prints the -e string, if any.
+ * Without -o, the join field is followed by the remaining fields of la
+ * and then the remaining fields of lb.
+ *
+ * A separator is emitted before every field except the first, so the
+ * line never carries a trailing separator and never loses one,
+ * whatever the field counts of la and lb are.
+ */
+static void
+prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb)
+{
+       struct spec *sp;
+       struct field *joinfield;
+       size_t i;
+
+       if (jfa >= la->nf || jfb >= lb->nf)
+               return;
+
+       joinfield = &la->fields[jfa];
+
+       if (oflag) {
+               for (i = 0; i < output.ns; i++) {
+                       sp = output.specs[i];
+
+                       if (i > 0)
+                               prsep();
+
+                       if (sp->fileno == 1) {
+                               if (sp->fldno < la->nf)
+                                       prfield(&la->fields[sp->fldno]);
+                               else if (replace)
+                                       fputs(replace, stdout);
+                       } else if (sp->fileno == 2) {
+                               if (sp->fldno < lb->nf)
+                                       prfield(&lb->fields[sp->fldno]);
+                               else if (replace)
+                                       fputs(replace, stdout);
+                       } else if (sp->fileno == 0) {
+                               prfield(joinfield);
+                       }
+               }
+       } else {
+               prfield(joinfield);
+
+               for (i = 0; i < la->nf; i++) {
+                       if (i != jfa) {
+                               prsep();
+                               prfield(&la->fields[i]);
+                       }
+               }
+               for (i = 0; i < lb->nf; i++) {
+                       if (i != jfb) {
+                               prsep();
+                               prfield(&lb->fields[i]);
+                       }
+               }
+       }
+
+       putchar('\n');
+}
+
+/*
+ * Print the raw text of lp followed by a newline; a short write is
+ * reported through eprintf().
+ */
+static void
+prline(struct line *lp)
+{
+       size_t want, got;
+
+       want = strlen(lp->text);
+       got = fwrite(lp->text, 1, want, stdout);
+       if (got != want)
+               eprintf("fwrite:");
+
+       putchar('\n');
+}
+
+/*
+ * Compare the join fields of la (index jfa) and lb (index jfb).
+ *
+ * Returns -1, 0 or 1 for less, equal or greater.  A line that lacks
+ * its join field sorts before one that has it; FIELD_ERROR is returned
+ * when both lines lack the field.
+ *
+ * Fields are not NUL-terminated, so only the bytes common to both are
+ * compared and a tie is broken by length.  Comparing past the shorter
+ * field would read separator bytes (or past the line buffer) and could
+ * report two different fields as equal.
+ */
+static int
+linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb)
+{
+       struct field *fa, *fb;
+       size_t n;
+       int status;
+
+       /* return FIELD_ERROR if both lines are short */
+       if (jfa >= la->nf) {
+               status = jfb >= lb->nf ? FIELD_ERROR : -1;
+       } else if (jfb >= lb->nf) {
+               status = 1;
+       } else {
+               fa = &la->fields[jfa];
+               fb = &lb->fields[jfb];
+               n = fa->len < fb->len ? fa->len : fb->len;
+
+               status = memcmp(fa->s, fb->s, n);
+               if (status > 0)
+                       status = 1;
+               else if (status < 0)
+                       status = -1;
+               else if (fa->len != fb->len)
+                       status = fa->len < fb->len ? -1 : 1;
+       }
+
+       return status;
+}
+
+/*
+ * Append the field (sp, len) to lp, multiplying the capacity of the
+ * fields array by GROW when it is full.
+ */
+static void
+addfield(struct line *lp, char *sp, size_t len)
+{
+       struct field *slot;
+
+       if (lp->nf >= lp->maxf) {
+               lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf),
+                       sizeof(struct field));
+               lp->maxf *= GROW;
+       }
+
+       slot = &lp->fields[lp->nf];
+       slot->s = sp;
+       slot->len = len;
+       lp->nf++;
+}
+
+/*
+ * Print the join of every line of spa with every line of spb, leaving
+ * out the last line of each span.
+ */
+static void
+prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb)
+{
+       size_t a, b;
+       size_t na = spa->nl - 1;
+       size_t nb = spb->nl - 1;
+
+       for (a = 0; a < na; a++) {
+               for (b = 0; b < nb; b++)
+                       prjoin(spa->lines[a], spb->lines[b], jfa, jfb);
+       }
+}
+
+/*
+ * Build a struct line from the buffer s of length len.
+ *
+ * A trailing newline is replaced by NUL.  The line keeps s as its text
+ * and the fields point into it, so s must stay allocated for as long
+ * as the line is used.
+ *
+ * Leading blanks are skipped.  With -t, fields are split on each
+ * occurrence of sep; otherwise they are split on runs of blanks.  The
+ * text after the last separator (possibly empty) is always added as
+ * the final field.
+ *
+ * Characters are cast to unsigned char before isblank(), and scanning
+ * never moves past the terminating NUL.
+ */
+static struct line *
+makeline(char *s, size_t len)
+{
+       struct line *lp;
+       char *sp, *end;
+
+       if (len > 0 && s[len-1] == '\n')
+               s[len-1] = '\0';
+
+       lp = ereallocarray(NULL, INIT, sizeof(struct line));
+       lp->text = s;
+       lp->fields = ereallocarray(NULL, INIT, sizeof(struct field));
+       lp->nf = 0;
+       lp->maxf = INIT;
+
+       for (sp = lp->text; isblank((unsigned char)*sp); sp++)
+               ;
+
+       for (;;) {
+               if (sep) {
+                       if (!(end = utfutf(sp, sep)))
+                               break;
+                       addfield(lp, sp, end - sp);
+                       sp = end + seplen;
+               } else {
+                       for (end = sp; *end != '\0' &&
+                            !isblank((unsigned char)*end); end++)
+                               ;
+                       if (*end == '\0')
+                               break;
+                       addfield(lp, sp, end - sp);
+                       /* end is on a blank: skip the whole run */
+                       while (isblank((unsigned char)*end))
+                               end++;
+                       sp = end;
+               }
+       }
+
+       /* last field: everything left on the line */
+       addfield(lp, sp, strlen(sp));
+
+       return lp;
+}
+
+/*
+ * Read the next line from fp and store it in span sp.
+ *
+ * reset: RESET discards the lines currently held in the span before
+ *        storing the new one; EXPAND appends it.
+ *
+ * Returns 1 when a line was stored and 0 when no line could be read;
+ * in that case the span is left untouched.  A read error is reported
+ * through eprintf().
+ *
+ * The getline() result is kept in a signed type so failure can be
+ * tested directly.  The buffer getline() may allocate on failure is
+ * released, and lines discarded by RESET are freed.
+ */
+static int
+addtospan(struct span *sp, FILE *fp, int reset)
+{
+       char *newl = NULL;
+       size_t i, size = 0;
+       ssize_t len;
+
+       if ((len = getline(&newl, &size, fp)) < 0) {
+               free(newl);
+               if (ferror(fp))
+                       eprintf("getline:");
+               return 0;
+       }
+
+       if (reset) {
+               for (i = 0; i < sp->nl; i++) {
+                       free(sp->lines[i]->fields);
+                       free(sp->lines[i]->text);
+                       free(sp->lines[i]);
+               }
+               sp->nl = 0;
+       }
+
+       if (sp->nl >= sp->maxl) {
+               sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl),
+                       sizeof(struct line *));
+               sp->maxl *= GROW;
+       }
+
+       sp->lines[sp->nl] = makeline(newl, (size_t)len);
+       sp->nl++;
+       return 1;
+}
+
+/* Set up sp as an empty span with room for INIT line pointers. */
+static void
+initspan(struct span *sp)
+{
+       sp->lines = ereallocarray(NULL, INIT, sizeof(struct line *));
+       sp->maxl = INIT;
+       sp->nl = 0;
+}
+
+/*
+ * Release the first sp->nl lines of the span (field array, text buffer
+ * and the line structure itself), then the pointer array.
+ */
+static void
+freespan(struct span *sp)
+{
+       size_t i;
+
+       for (i = 0; i < sp->nl; i++) {
+               free(sp->lines[i]->fields);
+               free(sp->lines[i]->text);
+               free(sp->lines[i]);     /* allocated in makeline() */
+       }
+
+       free(sp->lines);
+}
+
+/* Set up olp as an empty output list with room for INIT spec pointers. */
+static void
+initolist(struct outlist *olp)
+{
+       olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *));
+       olp->maxs = INIT;
+       olp->ns = 0;
+}
+
+/*
+ * Append sp to the output list, multiplying the capacity of the
+ * pointer array by GROW when it is full.
+ */
+static void
+addspec(struct outlist *olp, struct spec *sp)
+{
+       size_t slot;
+
+       if (olp->ns >= olp->maxs) {
+               olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs),
+                       sizeof(struct spec *));
+               olp->maxs *= GROW;
+       }
+
+       slot = olp->ns++;
+       olp->specs[slot] = sp;
+}
+
+/*
+ * Parse one -o list element into a newly allocated spec.
+ *
+ * Accepted forms are "0" (the join field) and "fileno.field" with
+ * fileno 1 or 2 and field >= 1.  The field number is stored zero-based.
+ * Anything else is reported through eprintf(), including a field
+ * number of 0, a file number other than 1 or 2, and trailing
+ * characters after the element.
+ */
+static struct spec *
+makespec(char *s)
+{
+       struct spec *sp;
+       int fileno = 0;
+       size_t fldno = 0;
+       char junk;
+
+       switch (s[0]) {
+       case '0':         /* join field */
+               if (s[1] != '\0')
+                       eprintf("\"%s\": invalid format\n", s);
+               break;
+       case '1': case '2':
+               /* %c only converts when something follows the field number */
+               if (sscanf(s, "%d.%zu%c", &fileno, &fldno, &junk) != 2 ||
+                   (fileno != 1 && fileno != 2) || fldno == 0)
+                       eprintf("\"%s\": invalid format\n", s);
+               fldno--;     /* user counts from 1, fields[] from 0 */
+               break;
+       default:
+               eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]);
+               break;
+       }
+
+       sp = ereallocarray(NULL, INIT, sizeof(struct spec));
+       sp->fileno = fileno;
+       sp->fldno = fldno;
+       return sp;
+}
+
+/*
+ * Split the -o argument s on commas, spaces and tabs and append one
+ * spec per element to olp.  s is modified in place.
+ *
+ * Empty elements, as produced by consecutive delimiters in a list
+ * such as "1.1, 2.1", are skipped.  A list with no elements at all is
+ * reported through eprintf().
+ */
+static void
+makeolist(struct outlist *olp, char *s)
+{
+       char *item, *sp;
+       size_t added = 0;
+
+       sp = s;
+
+       while (sp) {
+               item = sp;
+               sp = strpbrk(sp, ", \t");
+               if (sp)
+                       *sp++ = '\0';
+               if (*item == '\0')
+                       continue;
+               addspec(olp, makespec(item));
+               added++;
+       }
+
+       if (added == 0)
+               eprintf("empty output list\n");
+}
+
+/*
+ * Release every spec in olp and the pointer array that holds them,
+ * leaving olp empty.
+ */
+static void
+freespecs(struct outlist *olp)
+{
+       size_t i;
+
+       for (i = 0; i < olp->ns; i++)
+               free(olp->specs[i]);
+
+       free(olp->specs);
+       olp->specs = NULL;
+       olp->ns = 0;
+       olp->maxs = 0;
+}
+
+/*
+ * Join the lines of fa and fb on zero-based fields jfa and jfb.
+ *
+ * Each file is read through a span.  Outside a match the span holds
+ * one line.  During a match it collects every consecutive line with
+ * the same key plus one lookahead line, which becomes the span's
+ * current line for the next round.
+ *
+ * Unpairable lines are printed when unpairsa / unpairsb are set; paired
+ * lines are printed when pairs is set.
+ *
+ * After the main loop, whatever is still unread in the file that has
+ * not reached EOF is unpairable and is printed if requested.  This
+ * covers an empty input file and a file that ends while matching.
+ */
+static void
+join(FILE *fa, FILE *fb, size_t jfa, size_t jfb)
+{
+       struct span spa, spb;
+       int cmp, eofa, eofb;
+
+       initspan(&spa);
+       initspan(&spb);
+       cmp = eofa = eofb = 0;
+
+       addtospan(&spa, fa, RESET);
+       addtospan(&spb, fb, RESET);
+
+       while (spa.nl && spb.nl) {
+               if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) {
+                       if (unpairsa)
+                               prline(spa.lines[0]);
+                       if (!addtospan(&spa, fa, RESET)) {
+                               if (unpairsb) {    /* a is EOF'd; print the rest of b */
+                                       do
+                                               prline(spb.lines[0]);
+                                       while (addtospan(&spb, fb, RESET));
+                               }
+                               eofa = eofb = 1;
+                       } else {
+                               continue;
+                       }
+               } else if (cmp > 0) {
+                       if (unpairsb)
+                               prline(spb.lines[0]);
+                       if (!addtospan(&spb, fb, RESET)) {
+                               if (unpairsa) {    /* b is EOF'd; print the rest of a */
+                                       do
+                                               prline(spa.lines[0]);
+                                       while (addtospan(&spa, fa, RESET));
+                               }
+                               eofa = eofb = 1;
+                       } else {
+                               continue;
+                       }
+               } else if (cmp == 0) {
+                       /* read all consecutive matching lines from a */
+                       do {
+                               if (!addtospan(&spa, fa, EXPAND)) {
+                                       eofa = 1;
+                                       /* no lookahead line: count a dummy slot so
+                                        * prspanjoin() still covers every real line */
+                                       spa.nl++;
+                                       break;
+                               }
+                       } while (linecmp(spa.lines[spa.nl-1], spb.lines[0],
+                                        jfa, jfb) == 0);
+
+                       /* read all consecutive matching lines from b */
+                       do {
+                               if (!addtospan(&spb, fb, EXPAND)) {
+                                       eofb = 1;
+                                       spb.nl++;
+                                       break;
+                               }
+                       } while (linecmp(spa.lines[0], spb.lines[spb.nl-1],
+                                        jfa, jfb) == 0);
+
+                       if (pairs)
+                               prspanjoin(&spa, &spb, jfa, jfb);
+
+               } else {      /* FIELD_ERROR: both lines lacked join fields */
+                       if (unpairsa)
+                               prline(spa.lines[0]);
+                       if (unpairsb)
+                               prline(spb.lines[0]);
+                       eofa = addtospan(&spa, fa, RESET) ? 0 : 1;
+                       eofb = addtospan(&spb, fb, RESET) ? 0 : 1;
+                       if (!eofa && !eofb)
+                               continue;
+               }
+
+               if (eofa) {
+                       spa.nl = 0;
+               } else {
+                       /* move the lookahead line to the front of the span */
+                       swaplines(spa.lines[0], spa.lines[spa.nl - 1]);
+                       spa.nl = 1;
+               }
+
+               if (eofb) {
+                       spb.nl = 0;
+               } else {
+                       swaplines(spb.lines[0], spb.lines[spb.nl - 1]);
+                       spb.nl = 1;
+               }
+       }
+
+       /*
+        * At most one span still holds a line here, and that line has
+        * not been printed yet.  It and the rest of its file are
+        * unpairable.
+        */
+       if (unpairsa && spa.nl) {
+               do
+                       prline(spa.lines[0]);
+               while (addtospan(&spa, fa, RESET));
+       }
+       if (unpairsb && spb.nl) {
+               do
+                       prline(spb.lines[0]);
+               while (addtospan(&spb, fb, RESET));
+       }
+
+       freespan(&spa);
+       freespan(&spb);
+}
+
+
+/*
+ * Parse the options, open the two input files ("-" selects standard
+ * input) and run the join.
+ *
+ * Join field numbers given with -1 / -2 count from 1 and are converted
+ * to zero-based indices before join() is called.
+ *
+ * An empty -t delimiter is rejected: the field splitter could never
+ * advance past a zero-length separator.
+ */
+int
+main(int argc, char *argv[])
+{
+       size_t jf[2] = { jfield, jfield, };
+       FILE *fp[2];
+       int n;
+       char *fno;
+
+       ARGBEGIN {
+       case '1':
+               jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
+               break;
+       case '2':
+               jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
+               break;
+       case 'a':
+               fno = EARGF(usage());
+               if (strcmp(fno, "1") == 0)
+                       unpairsa = 1;
+               else if (strcmp(fno, "2") == 0)
+                       unpairsb = 1;
+               else
+                       usage();
+               break;
+       case 'e':
+               replace = EARGF(usage());
+               break;
+       case 'o':
+               oflag = 1;
+               initolist(&output);
+               makeolist(&output, EARGF(usage()));
+               break;
+       case 't':
+               sep = EARGF(usage());
+               break;
+       case 'v':
+               pairs = 0;      /* -v replaces the normal joined output */
+               fno = EARGF(usage());
+               if (strcmp(fno, "1") == 0)
+                       unpairsa = 1;
+               else if (strcmp(fno, "2") == 0)
+                       unpairsb = 1;
+               else
+                       usage();
+               break;
+       default:
+               usage();
+       } ARGEND;
+
+       if (sep) {
+               seplen = unescape(sep);
+               if (seplen == 0)
+                       eprintf("empty delimiter\n");
+       }
+
+       if (argc != 2)
+               usage();
+
+       for (n = 0; n < 2; n++) {
+               if (argv[n][0] == '-' && !argv[n][1]) {
+                       argv[n] = "<stdin>";
+                       fp[n] = stdin;
+               } else if (!(fp[n] = fopen(argv[n], "r"))) {
+                       eprintf("fopen %s:", argv[n]);
+               }
+       }
+
+       /* user-visible field numbers start at 1, fields[] at 0 */
+       jf[0]--;
+       jf[1]--;
+
+       join(fp[0], fp[1], jf[0], jf[1]);
+
+       if (oflag)
+               freespecs(&output);
+
+       enfshut(2, fp[0], argv[0]);
+       if (fp[0] != fp[1])     /* both operands may be "-" */
+               enfshut(2, fp[1], argv[1]);
+       enfshut(2, stdout, "<stdout>");
+       exit(0);
+}

Reply via email to