Any developer wants to ok this?
-Otto
On Sun, Oct 07, 2018 at 10:08:08AM +0200, Otto Moerbeek wrote:
> On Sat, Oct 06, 2018 at 08:57:35PM +0200, Andreas Kusalananda Kähäri wrote:
>
> > >Synopsis: Wrong result in right and full outer join with join(1)
> > >Category: system
> > >Environment:
> > System : OpenBSD 6.3
> > Details : OpenBSD 6.3 (GENERIC.MP) #11: Thu Sep 20 16:05:37
> > CEST 2018
> >
> > [email protected]:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> >
> > Architecture: OpenBSD.amd64
> > Machine : amd64
> > >Description:
> >
> > The join(1) utility produces the wrong result when performing a right
> > join or a full outer join between two files under some circumstances.
> >
> > >How-To-Repeat:
> >
> > $ cat test1.input
> > 1,A1,B1
> > 2
> > 3,E1,F1
> >
> > (it makes no difference if the second line reads "2,," or "2,C1," or
> > "2,C1,D1" above)
> >
> > $ cat test2.input
> > 1,A2,B2
> > 3,E2,F2
> >
> > (none of the files has an empty line at the end, that's only added for
> > readability here)
> >
> > Doing a left join (using -a1) works as expected:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 test1.input test2.input
> > 1,A1,B1,A2,B2
> > 2,,,,
> > 3,E1,F1,E2,F2
> >
> > Doing a right join (using -a2):
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a2 test1.input test2.input
> > 1,A1,B1,A2,B2
> > 3,,,E2,F2
> >
> > The data from the first file is dropped even though there should be a
> > match on its "3" line.
> >
> > The expected result is
> > 1,A1,B1,A2,B2
> > 3,E1,F1,E2,F2
> >
> > Doing a full outer join:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 -a2 test1.input test2.input
> > 1,A1,B1,A2,B2
> > 2,,,,
> > 3,,,E2,F2
> > 3,E1,F1,,
> >
> > Again, the "3" line in the two files don't appear to get matched up.
> >
> > The expected result is
> > 1,A1,B1,A2,B2
> > 2,,,,
> > 3,E1,F1,E2,F2
> >
> > Doing the join with the two files swapped around works in all cases:
> >
> > Left:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 test2.input test1.input
> > 1,A2,B2,A1,B1
> > 3,E2,F2,E1,F1
> >
> > Right:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a2 test2.input test1.input
> > 1,A2,B2,A1,B1
> > 2,,,,
> > 3,E2,F2,E1,F1
> >
> > Full outer:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 -a2 test2.input test1.input
> > 1,A2,B2,A1,B1
> > 2,,,,
> > 3,E2,F2,E1,F1
> >
> > Also, inner joins (without using -a at all) works as it should.
> >
> > $ hexdump -C test1.input
> > 00000000 31 2c 41 31 2c 42 31 0a 32 2c 43 31 2c 44 31 0a
> > |1,A1,B1.2,C1,D1.|
> > 00000010 33 2c 45 31 2c 46 31 0a |3,E1,F1.|
> > 00000018
> > $ hexdump -C test2.input
> > 00000000 31 2c 41 32 2c 42 32 0a 33 2c 45 32 2c 46 32 0a
> > |1,A2,B2.3,E2,F2.|
> > 00000010
>
> Hi,
>
> This diff fixes your case. In the past, we cherry picked some changes
> from FreeBSD, but for some reason the main logic was left alone, while
> our version was overly complex. FreeBSDs main loop is more simple and
> easy to understand.
>
> This need carefull scrutiny, since an error is easily made and I mostly
> did this without enough coffee.
>
> -Otto
>
> Index: join.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/join/join.c,v
> retrieving revision 1.28
> diff -u -p -r1.28 join.c
> --- join.c 18 Jul 2018 17:20:54 -0000 1.28
> +++ join.c 7 Oct 2018 07:58:02 -0000
> @@ -53,8 +53,6 @@ typedef struct {
> char **fields; /* line field(s) */
> u_long fieldcnt; /* line field(s) count */
> u_long fieldalloc; /* line field(s) allocated count */
> - u_long cfieldc; /* current field count */
> - long fpos; /* fpos of start of field */
> } LINE;
>
> typedef struct {
> @@ -67,10 +65,9 @@ typedef struct {
> u_long pushback; /* line on the stack */
> u_long setcnt; /* set count */
> u_long setalloc; /* set allocated count */
> - u_long setusedc; /* sets used */
> } INPUT;
> -INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0, 0 },
> - input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0, 0 };
> +INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 },
> + input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 };
>
> typedef struct {
> u_long filenum; /* file number */
> @@ -214,12 +211,8 @@ main(int argc, char *argv[])
> if (pledge("stdio", NULL) == -1)
> err(1, "pledge");
>
> - F1->setusedc = 0;
> - F2->setusedc = 0;
> slurp(F1);
> slurp(F2);
> - F1->set->cfieldc = 0;
> - F2->set->cfieldc = 0;
>
> /*
> * We try to let the files have the same field value, advancing
> @@ -234,24 +227,16 @@ main(int argc, char *argv[])
> joinlines(F1, F2);
> slurp(F1);
> slurp(F2);
> - }
> - else {
> - if (F1->unpair
> - && (cval < 0 || F2->set->cfieldc == F2->setusedc -1)) {
> + } else if (cval < 0) {
> + /* File 1 takes the lead... */
> + if (F1->unpair)
> joinlines(F1, NULL);
> - slurp(F1);
> - }
> - else if (cval < 0)
> - /* File 1 takes the lead... */
> - slurp(F1);
> - if (F2->unpair
> - && (cval > 0 || F1->set->cfieldc == F1->setusedc -1)) {
> + slurp(F1);
> + } else {
> + /* File 2 takes the lead... */
> + if (F2->unpair)
> joinlines(F2, NULL);
> - slurp(F2);
> - }
> - else if (cval > 0)
> - /* File 2 takes the lead... */
> - slurp(F2);
> + slurp(F2);
> }
> }
>
> @@ -273,36 +258,15 @@ main(int argc, char *argv[])
> return 0;
> }
>
> -/* wrapper around slurpit() to keep track of what field we are on */
> -void slurp(INPUT *F)
> -{
> - long fpos;
> - u_long cfieldc;
> -
> - if (F->set == NULL) {
> - fpos = 0;
> - cfieldc = 0;
> - }
> - else {
> - fpos = F->set->fpos;
> - cfieldc = F->set->cfieldc;
> - }
> - slurpit(F);
> - if (F->set == NULL)
> - return;
> - else if (fpos != F->set->fpos)
> - F->set->cfieldc = cfieldc+1;
> -}
> -
> void
> -slurpit(INPUT *F)
> +slurp(INPUT *F)
> {
> LINE *lp, *lastlp, tmp;
> ssize_t len;
> size_t linesize;
> u_long cnt;
> char *bp, *fieldp, *line;
> - long fpos;
> +
> /*
> * Read all of the lines from an input file that have the same
> * join field.
> @@ -311,7 +275,7 @@ slurpit(INPUT *F)
> F->setcnt = 0;
> line = NULL;
> linesize = 0;
> - for (lastlp = NULL; ; ++F->setcnt, lastlp = lp) {
> + for (lastlp = NULL; ; ++F->setcnt) {
> /*
> * If we're out of space to hold line structures, allocate
> * more. Initialize the structure so that we know that this
> @@ -339,6 +303,8 @@ slurpit(INPUT *F)
> * level of indirection, but it's probably okay as is.
> */
> lp = &F->set[F->setcnt];
> + if (F->setcnt)
> + lastlp = &F->set[F->setcnt - 1];
> if (F->pushbool) {
> tmp = F->set[F->setcnt];
> F->set[F->setcnt] = F->set[F->pushback];
> @@ -348,11 +314,6 @@ slurpit(INPUT *F)
> }
> if ((len = getline(&line, &linesize, F->fp)) == -1)
> break;
> - /*
> - * we depend on knowing on what field we are, one safe way is
> - * the file position.
> - */
> - fpos = ftell(F->fp) - len;
>
> /* Remove trailing newline, if it exists, and copy line. */
> if (line[len - 1] == '\n')
> @@ -366,10 +327,8 @@ slurpit(INPUT *F)
> lp->line = p;
> lp->linealloc = newsize;
> }
> - F->setusedc++;
> memcpy(lp->line, line, len);
> lp->line[len] = '\0';
> - lp->fpos = fpos;
>
> /* Split the line into fields, allocate space as necessary. */
> lp->fieldcnt = 0;