Hello,

I have made an attempt at join(1). I hope it is worth a look.

See NOTES for some points on which this implementation differs from
other joins. The handling of -a | -v when used with -o is an issue.

Comments are welcome. Please be brutal.

Regards,

--
Wolfgang Corcoran-Mathe
/* Notes:                                                       *
 *                                                              *
 * Input file order is not checked.                             *
 *                                                              *
 * Separators specified with -t may be multiple characters.     *
 *                                                              *
 * Two lines that both lack join fields are not matched. Thus,  *
 * 'join -1 3 -2 3' will consider the lines 'don quixote' and   *
 * 'don juan' unpairable. (GNU join matches these lines).       *
 *                                                              *
 * Unpairable lines (when selected with the -a or -v flags)     *
 * currently ignore any formatting specified with -o. This does *
 * not conform to POSIX, but I have not seen any joins that     *
 * handle cases like '-v 1 -v 2 -o "0 1.3 2.1"' intelligently.  *
 * (GNU join produces especially bizarre output when the above  *
 * format/inversion is used on shorter lines.) Suggestions are  *
 * welcome.                                                     */

#include <ctype.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "arg.h"
#include "text.h"
#include "utf.h"
#include "util.h"

/* Smaller of two values. Function-like macro: an argument may be
 * evaluated twice, so do not pass expressions with side effects. */
#undef MIN
#define MIN(x,y)  ((x) < (y) ? (x) : (y))

/* Larger of two values; same double-evaluation caveat as MIN. */
#undef MAX
#define MAX(x,y)  ((x) > (y) ? (x) : (y))

/* Sizing constants for the growable arrays in this file: INIT is the
 * initial capacity, GROW the factor a full array's capacity is
 * multiplied by. */
enum {
        INIT = 1,
        GROW = 2
};

/* Modes for addtospan(): EXPAND appends the new line to the span,
 * RESET sets the span's line count to zero first so the new line
 * becomes its only entry. */
enum {
        EXPAND = 0,
        RESET  = 1
};

/* Returned by linecmp() when both lines lack their join field. */
enum { FIELD_ERROR = -2 };

/* One field of an input line. The bytes are not NUL-terminated at the
 * field boundary; always use len. */
struct field {
        char *s;        /* start of the field inside the owning line's text */
        size_t len;     /* number of bytes in the field */
};

/* An input line split into fields (built by makeline()). */
struct line {
        char *text;             /* the line's buffer, trailing newline removed */
        size_t nf;              /* number of fields in use */
        size_t maxf;            /* allocated capacity of fields[] */
        struct field *fields;   /* growable array of fields pointing into text */
};

/* One element of a -o output list (built by makespec()). */
struct spec {
        size_t fileno;  /* 0 = the join field, 1 or 2 = field of that file */
        size_t fldno;   /* zero-based field index (unused when fileno is 0) */
};

/* Growable list of output specs collected from -o. */
struct outlist {
        size_t ns;              /* number of specs in use */
        size_t maxs;            /* allocated capacity of specs[] */
        struct spec **specs;    /* array of pointers to individually allocated specs */
};

/* Growable run of lines read from one input file (see addtospan()). */
struct span {
        size_t nl;              /* number of lines counted in the span */
        size_t maxl;            /* allocated capacity of lines[] */
        struct line **lines;    /* array of pointers to individually allocated lines */
};

/* -t separator string; NULL means fields are delimited by blanks. */
static char *sep = NULL;
/* -e string printed in place of a missing -o field; NULL if not given. */
static char *replace = NULL;
/* Output field separator used when no -t separator is set. */
static const char defaultfs = ' ';
static const int jfield = 1;            /* POSIX default join field */
/* Set by -a/-v: print unpairable lines of file 1 / file 2. */
static int unpairsa = 0, unpairsb = 0;
/* Non-zero once -o has been given. */
static int oflag = 0;
/* Non-zero while joined (paired) lines are to be printed; cleared by -v. */
static int pairs = 1;
/* Length of sep as returned by unescape(); set in main() when -t is given. */
static size_t seplen;
/* Output list built from the -o argument. */
static struct outlist output;

/* Program name printed by usage(); presumably set by ARGBEGIN (arg.h). */
char *argv0;


/* Report the command synopsis, prefixed with the program name,
 * through eprintf(). */
static void usage(void)
{
        eprintf("usage: %s [-1 field] [-2 field] [-o list] "
                "[-e string] [-a | -v fileno] [-t delim] "
                "file1 file2\n",
                argv0);
}

/* Write the bytes of one field to stdout; a short write is reported
 * through eprintf(). */
static void prfield(struct field *fp)
{
        size_t written;

        written = fwrite(fp->s, 1, fp->len, stdout);
        if (written != fp->len)
                eprintf("fwrite:");
}

static void swaplines(struct line *la, struct line *lb)
{
        struct line tmp;

        tmp = *la;
        *la = *lb;
        *lb = tmp;
}

static void prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb)
{
        struct spec *sp;
        struct field *joinfield;
        size_t i;

        if (jfa >= la->nf || jfb >= lb->nf)
                return;

        joinfield = &la->fields[jfa];

        if (oflag) {
                for (i = 0; i < output.ns; i++) {
                        sp = output.specs[i];

                        if (sp->fileno == 1) {
                                if (sp->fldno < la->nf)
                                        prfield(&la->fields[sp->fldno]);
                                else if (replace)
                                        fputs(replace, stdout);
                        } else if (sp->fileno == 2) {
                                if (sp->fldno < lb->nf)
                                        prfield(&lb->fields[sp->fldno]);
                                else if (replace)
                                        fputs(replace, stdout);
                        } else if (sp->fileno == 0) {
                                prfield(joinfield);
                        }

                        if (i < output.ns - 1) {
                                if (sep) {
                                        fwrite(sep, 1, seplen, stdout);
                                } else {
                                        putchar(defaultfs);
                                }
                        }
                }
        } else {
                prfield(joinfield);

                for (i = 0; i < la->nf; i++)
                        if (i != jfa) {
                                if (sep)
                                        fwrite(sep, 1, seplen, stdout);
                                else
                                        putchar(defaultfs);
                                prfield(&la->fields[i]);
                        }
                for (i = 0; i < lb->nf; i++)
                        if (i != jfb) {
                                if (sep)
                                        fwrite(sep, 1, seplen, stdout);
                                else
                                        putchar(defaultfs);
                                prfield(&lb->fields[i]);
                        }
        }

        putchar('\n');
}

/* Print a line's text followed by a newline. A short write is
 * reported through eprintf() together with the first 20 bytes of
 * the line. */
static void prline(struct line *lp)
{
        char *text = lp->text;
        size_t n = strlen(text);

        if (fwrite(text, 1, n, stdout) != n)
                eprintf("fwrite \"%.20s\":", text);
        putchar('\n');
}

static int linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb)
{
        int status;

        /* When both lines are short (i.e. lacking join fields), GNU join *
         * considers them a match. We return FIELD_ERROR for these lines. */
        if (jfa >= la->nf)
                status = jfb >= lb->nf ? FIELD_ERROR : -1;
        else if (jfb >= lb->nf)
                return 1;
        else {
                status = memcmp(la->fields[jfa].s, lb->fields[jfb].s,
                MAX (la->fields[jfa].len, lb->fields[jfb].len));
                if (status > 0)
                        status = 1;
                else if (status < 0)
                        status = -1;
        }

        return status;
}

static void addfield(struct line *lp, char *sp, size_t len)
{
        if (lp->nf >= lp->maxf) {
                lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf),
                        sizeof(struct field));
                lp->maxf *= GROW;
        }
        lp->fields[lp->nf].s = sp;
        lp->fields[lp->nf].len = len;
        lp->nf++;
}

/* Print the join of every line of spa with every line of spb.
 *
 * The last counted entry of each span is skipped: join() leaves the
 * first non-matching (look-ahead) line there, or bumps the count by
 * one at end of file, so only entries 0 .. nl-2 belong to the match. */
static void prspanjoin(struct span *spa, struct span *spb, size_t jfa,
        size_t jfb)
{
        size_t ia, ib;

        for (ia = 0; ia + 1 < spa->nl; ia++) {
                for (ib = 0; ib + 1 < spb->nl; ib++) {
                        prjoin(spa->lines[ia], spb->lines[ib], jfa, jfb);
                }
        }
}

/* Build a line structure from the NUL-terminated buffer s.
 *
 * A single trailing newline is removed from s in place. The returned
 * line keeps s as its text (no copy is made) and its fields point
 * into that buffer.
 *
 * With a -t separator every occurrence of the separator delimits a
 * field and leading blanks are data, so an empty first field is kept.
 * Without one, leading blanks are skipped and fields are delimited by
 * runs of blanks. In both modes the remainder of the line after the
 * last delimiter is added as the final field, even when it is empty. */
static struct line *makeline(char *s)
{
        struct line *lp;
        char *sp, *beg, *end;
        size_t len;
        int eol = 0;

        /* guard len == 0 so s[len - 1] never indexes before the buffer */
        len = strlen(s);
        if (len > 0 && s[len - 1] == '\n')
                s[len - 1] = '\0';

        lp = (struct line *) emalloc(sizeof(struct line));
        lp->text = s;
        lp->fields = (struct field *) emalloc(INIT * sizeof(struct field));
        lp->nf = 0;
        lp->maxf = INIT;

        sp = lp->text;
        if (!sep) {
                /* <ctype.h> functions need an unsigned char value */
                while (isblank((unsigned char)*sp))
                        sp++;
        }

        while (!eol) {
                beg = sp;

                if (sep) {
                        end = utfutf(sp, sep);
                        if (end) {
                                addfield(lp, beg, end - beg);
                                end += seplen;
                        } else {
                                eol = 1;
                        }
                } else {
                        for (end = sp; *end != '\0'; end++)
                                if (isblank((unsigned char)*end))
                                        break;

                        if (*end != '\0') {
                                addfield(lp, beg, end - beg);
                                /* skip the blank run; never step past
                                 * the terminating NUL */
                                while (isblank((unsigned char)*end))
                                        end++;
                        } else {
                                eol = 1;
                        }
                }

                if (eol)
                        addfield(lp, beg, strlen(beg));
                else
                        sp = end;
        }

        return lp;
}

/* Read the next line of fp and store it in the span.
 *
 * reset: RESET releases the lines currently counted in the span and
 *        makes the new line its only entry; EXPAND appends it.
 *
 * Returns 1 when a line was added. Returns 0 at end of file, in which
 * case the span is left untouched (also in RESET mode). A read error
 * is reported through eprintf(). */
static int addtospan(struct span *sp, FILE *fp, int reset)
{
        char *newl = NULL;
        size_t size = 0;
        size_t i;

        if (getline(&newl, &size, fp) == -1) {
                if (ferror(fp))
                        eprintf("ferror:");
                /* getline() may have allocated a buffer even though
                 * it failed; the caller never sees it, so free it */
                free(newl);
                return 0;
        }

        if (reset) {
                /* the replaced lines are unreachable once nl is zero */
                for (i = 0; i < sp->nl; i++) {
                        free(sp->lines[i]->fields);
                        free(sp->lines[i]->text);
                        free(sp->lines[i]);
                }
                sp->nl = 0;
        }

        if (sp->nl >= sp->maxl) {
                sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl),
                        sizeof(struct line *));
                sp->maxl *= GROW;
        }

        sp->lines[sp->nl] = makeline(newl);
        sp->nl++;
        return 1;
}

/* Set up an empty span with room for INIT line pointers. */
static void initspan(struct span *sp)
{
        struct line **slots;

        slots = (struct line **) emalloc(INIT * sizeof(struct line *));

        sp->lines = slots;
        sp->maxl = INIT;
        sp->nl = 0;
}

static void freespan(struct span *sp)
{
        size_t i;

        for (i = 0; i < sp->nl; i++) {
                free(sp->lines[i]->fields);
                free(sp->lines[i]->text);
        }

        free(sp->lines);
}

/* Set up an empty output list with room for INIT spec pointers. */
static void initolist(struct outlist *olp)
{
        struct spec **slots;

        slots = (struct spec **) emalloc(INIT * sizeof(struct spec *));

        olp->specs = slots;
        olp->maxs = INIT;
        olp->ns = 0;
}

/* Append sp to the output list, multiplying the capacity of the
 * pointer array by GROW when it is full. The list stores the pointer;
 * it does not copy the spec. */
static void addspec(struct outlist *olp, struct spec *sp)
{
        size_t newmax;

        if (olp->ns >= olp->maxs) {
                newmax = GROW * olp->maxs;
                olp->specs = ereallocarray(olp->specs, newmax,
                        sizeof(struct spec *));
                olp->maxs = newmax;
        }

        olp->specs[olp->ns++] = sp;
}

/* Parse one -o list element into a newly allocated spec.
 *
 * Accepted forms are exactly "0" (the join field) and "N.M" where N
 * is 1 or 2 and M is a decimal field number starting at 1. The field
 * number is stored zero-based. Anything else (for example "1.0",
 * "12.3", "1.2x" or "0x") is reported through eprintf(). */
static struct spec *makespec(char *s)
{
        struct spec *sp;
        char *end;
        unsigned long num;
        int fileno = 0;
        size_t fldno = 0;

        switch (s[0]) {
        case '0':         /* join field */
                if (s[1] != '\0')
                        eprintf("\"%s\": invalid format\n", s);
                break;
        case '1':
        case '2':
                /* require "<digit>.<digits>" and nothing else; the
                 * checks short-circuit so s[2] is only read when
                 * s[1] is not the terminator */
                if (s[1] != '.' || !isdigit((unsigned char)s[2]))
                        eprintf("\"%s\": invalid format\n", s);
                num = strtoul(s + 2, &end, 10);
                /* field numbers start at 1; 0 would wrap below */
                if (*end != '\0' || num == 0)
                        eprintf("\"%s\": invalid format\n", s);
                fileno = s[0] - '0';
                fldno = num - 1;
                break;
        default:
                eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]);
                break;
        }

        sp = (struct spec *) emalloc(sizeof(struct spec));
        sp->fileno = fileno;
        sp->fldno = fldno;
        return sp;
}

/* Split the -o argument s at commas, spaces and tabs and append one
 * spec per element to olp. s is modified in place: each delimiter is
 * overwritten with a NUL. Every element, including an empty one
 * between two adjacent delimiters, is handed to makespec(). */
static void makeolist(struct outlist *olp, char *s)
{
        char *cur, *next;

        for (cur = s; cur != NULL; cur = next) {
                next = strpbrk(cur, ", \t");
                if (next != NULL) {
                        *next = '\0';
                        next++;
                }
                addspec(olp, makespec(cur));
        }
}

/* Release every spec in the output list and the pointer array that
 * held them. The list is left empty and must be set up again with
 * initolist() before it is reused. */
static void freespecs(struct outlist *olp)
{
        size_t i;

        for (i = 0; i < olp->ns; i++)
                free(olp->specs[i]);

        /* the array was allocated by initolist()/addspec() as well */
        free(olp->specs);
        olp->specs = NULL;
        olp->ns = 0;
        olp->maxs = 0;
}

static void join(FILE *fa, FILE *fb, size_t jfa, size_t jfb)
{
        struct span spa, spb;
        int cmp, eofa, eofb;

        initspan(&spa);
        initspan(&spb);
        cmp = eofa = eofb = 0;

        addtospan(&spa, fa, RESET);
        addtospan(&spb, fb, RESET);

        while (spa.nl && spb.nl) {
                if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) {
                        if (unpairsa)
                                prline(spa.lines[0]);
                        if (!addtospan(&spa, fa, RESET)) {
                                if (unpairsb) {    /* a is EOF'd; print the 
rest of b */
                                        do {
                                                prline(spb.lines[0]);
                                        } while (addtospan(&spb, fb, RESET));
                                }
                                eofa = eofb = 1;
                        } else {
                                continue;
                        }
                } else if (cmp > 0) {
                        if (unpairsb)
                                prline(spb.lines[0]);
                        if (!addtospan(&spb, fb, RESET)) {
                                if (unpairsa) {    /* b is EOF'd; print the 
rest of a */
                                        do {
                                                prline(spa.lines[0]);
                                        } while (addtospan(&spa, fa, RESET));
                                }
                                eofa = eofb = 1;
                        } else {
                                continue;
                        }
                } else if (cmp == 0) {
                        /* read all consecutive matching lines from a */
                        do {
                                if (!addtospan(&spa, fa, EXPAND)) {
                                        eofa = 1;
                                        spa.nl++;
                                        break;
                                }
                        } while (linecmp(spa.lines[spa.nl-1], spb.lines[0], 
jfa, jfb) == 0);

                        /* read all consecutive matching lines from b */
                        do {
                                if (!addtospan(&spb, fb, EXPAND)) {
                                        eofb = 1;
                                        spb.nl++;
                                        break;
                                }
                        } while (linecmp(spa.lines[0], spb.lines[spb.nl-1], 
jfa, jfb) == 0);

                        if (pairs)
                                prspanjoin(&spa, &spb, jfa, jfb);

                } else {      /* FIELD_ERROR: both lines lacked join fields */
                        if (unpairsa)
                                prline(spa.lines[0]);
                        if (unpairsb)
                                prline(spb.lines[0]);
                        eofa = addtospan(&spa, fa, RESET) ? 0 : 1;
                        eofb = addtospan(&spb, fb, RESET) ? 0 : 1;
                        if (!eofa && !eofb)
                                continue;
                }

                if (eofa) {
                        spa.nl = 0;
                } else {
                        swaplines(spa.lines[0], spa.lines[spa.nl - 1]);   /* 
ugly */
                        spa.nl = 1;
                }

                if (eofb) {
                        spb.nl = 0;
                } else {
                        swaplines(spb.lines[0], spb.lines[spb.nl - 1]);   /* 
ugly */
                        spb.nl = 1;
                }
        }
        freespan(&spa);
        freespan(&spb);
}


/* Parse the options, open the two input files ("-" selects standard
 * input) and join them.
 *
 * Options: -1/-2 set the one-based join field of file 1/2, -a adds
 * the unpairable lines of the given file to the output, -v prints
 * only unpairable lines of the given file, -e sets the replacement
 * string for missing -o fields, -o sets the output list and -t sets
 * the field separator. */
int main(int argc, char *argv[])
{
        size_t jf[2] = { jfield, jfield };
        FILE *fp[2];
        int n;
        char *fno;

        ARGBEGIN {
        case '1':
                /* field numbers are one-based; accepting 0 would wrap
                 * to SIZE_MAX when converted to an index below */
                jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
                break;
        case '2':
                jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
                break;
        case 'a':
                fno = EARGF(usage());
                if (strcmp(fno, "1") == 0)
                        unpairsa = 1;
                else if (strcmp(fno, "2") == 0)
                        unpairsb = 1;
                else
                        usage();
                break;
        case 'e':
                replace = EARGF(usage());
                break;
        case 'o':
                oflag = 1;
                initolist(&output);
                makeolist(&output, EARGF(usage()));
                break;
        case 't':
                sep = EARGF(usage());
                break;
        case 'v':
                pairs = 0;
                fno = EARGF(usage());
                if (strcmp(fno, "1") == 0)
                        unpairsa = 1;
                else if (strcmp(fno, "2") == 0)
                        unpairsb = 1;
                else
                        usage();
                break;
        default:
                usage();
        } ARGEND;

        if (sep) {
                seplen = unescape(sep);
                /* NOTE(review): a zero-length separator can never
                 * advance the field scan in makeline() if utfutf()
                 * matches an empty needle -- confirm; rejected here */
                if (seplen == 0)
                        eprintf("-t: separator must not be empty\n");
        }

        if (argc != 2)
                usage();

        for (n = 0; n < 2; n++) {
                if (argv[n][0] == '-' && !argv[n][1]) {
                        argv[n] = "<stdin>";
                        fp[n] = stdin;
                } else {
                        if (!(fp[n] = fopen(argv[n], "r")))
                                eprintf("fopen %s:", argv[n]);
                }
        }

        /* convert the one-based field numbers to array indices */
        jf[0]--;
        jf[1]--;

        join(fp[0], fp[1], jf[0], jf[1]);

        if (oflag)
                freespecs(&output);

        enfshut(2, fp[0], argv[0]);
        /* both operands may be "-"; do not close stdin twice */
        if (fp[0] != fp[1])
                enfshut(2, fp[1], argv[1]);
        enfshut(2, stdout, "<stdout>");
        exit(0);
}

Reply via email to