On Sat, Oct 06, 2018 at 08:57:35PM +0200, Andreas Kusalananda Kähäri wrote:
> >Synopsis: Wrong result in right and full outer join with join(1) > >Category: system > >Environment: > System : OpenBSD 6.3 > Details : OpenBSD 6.3 (GENERIC.MP) #11: Thu Sep 20 16:05:37 CEST > 2018 > > [email protected]:/usr/src/sys/arch/amd64/compile/GENERIC.MP > > Architecture: OpenBSD.amd64 > Machine : amd64 > >Description: > > The join(1) utility produces the wrong result when performing a right > join or a full outer join between two files under some circumstances. > > >How-To-Repeat: > > $ cat test1.input > 1,A1,B1 > 2 > 3,E1,F1 > > (it makes no difference if the second line reads "2,," or "2,C1," or > "2,C1,D1" above) > > $ cat test2.input > 1,A2,B2 > 3,E2,F2 > > (none of the files has an empty line at the end, that's only added for > readability here) > > Doing a left join (using -a1) works as expected: > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 test1.input test2.input > 1,A1,B1,A2,B2 > 2,,,, > 3,E1,F1,E2,F2 > > Doing a right join (using -a2): > $ join -t, -o0,1.2,1.3,2.2,2.3 -a2 test1.input test2.input > 1,A1,B1,A2,B2 > 3,,,E2,F2 > > The data from the first file is dropped even though there should be a > match on its "3" line. > > The expected result is > 1,A1,B1,A2,B2 > 3,E1,F1,E2,F2 > > Doing a full outer join: > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 -a2 test1.input test2.input > 1,A1,B1,A2,B2 > 2,,,, > 3,,,E2,F2 > 3,E1,F1,, > > Again, the "3" line in the two files don't appear to get matched up. > > The expected result is > 1,A1,B1,A2,B2 > 2,,,, > 3,E1,F1,E2,F2 > > Doing the join with the two files swapped around works in all cases: > > Left: > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 test2.input test1.input > 1,A2,B2,A1,B1 > 3,E2,F2,E1,F1 > > Right: > $ join -t, -o0,1.2,1.3,2.2,2.3 -a2 test2.input test1.input > 1,A2,B2,A1,B1 > 2,,,, > 3,E2,F2,E1,F1 > > Full outer: > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 -a2 test2.input test1.input > 1,A2,B2,A1,B1 > 2,,,, > 3,E2,F2,E1,F1 > > Also, inner joins (without using -a at all) works as it should. > > $ hexdump -C test1.input > 00000000 31 2c 41 31 2c 42 31 0a 32 2c 43 31 2c 44 31 0a |1,A1,B1.2,C1,D1.| > 00000010 33 2c 45 31 2c 46 31 0a |3,E1,F1.| > 00000018 > $ hexdump -C test2.input > 00000000 31 2c 41 32 2c 42 32 0a 33 2c 45 32 2c 46 32 0a |1,A2,B2.3,E2,F2.| > 00000010 Hi, This diff fixes your case. In the past, we cherry picked some changes from FreeBSD, but for some reason the main logic was left alone, while our version was overly complex. FreeBSDs main loop is more simple and easy to understand. This need carefull scrutiny, since an error is easily made and I mostly did this without enough coffee. -Otto Index: join.c =================================================================== RCS file: /cvs/src/usr.bin/join/join.c,v retrieving revision 1.28 diff -u -p -r1.28 join.c --- join.c 18 Jul 2018 17:20:54 -0000 1.28 +++ join.c 7 Oct 2018 07:58:02 -0000 @@ -53,8 +53,6 @@ typedef struct { char **fields; /* line field(s) */ u_long fieldcnt; /* line field(s) count */ u_long fieldalloc; /* line field(s) allocated count */ - u_long cfieldc; /* current field count */ - long fpos; /* fpos of start of field */ } LINE; typedef struct { @@ -67,10 +65,9 @@ typedef struct { u_long pushback; /* line on the stack */ u_long setcnt; /* set count */ u_long setalloc; /* set allocated count */ - u_long setusedc; /* sets used */ } INPUT; -INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0, 0 }, - input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0, 0 }; +INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 }, + input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 }; typedef struct { u_long filenum; /* file number */ @@ -214,12 +211,8 @@ main(int argc, char *argv[]) if (pledge("stdio", NULL) == -1) err(1, "pledge"); - F1->setusedc = 0; - F2->setusedc = 0; slurp(F1); slurp(F2); - F1->set->cfieldc = 0; - F2->set->cfieldc = 0; /* * We try to let the files have the same field value, advancing @@ -234,24 +227,16 @@ main(int argc, char *argv[]) joinlines(F1, F2); slurp(F1); slurp(F2); - } - else { - if (F1->unpair - && (cval < 0 || F2->set->cfieldc == F2->setusedc -1)) { + } else if (cval < 0) { + /* File 1 takes the lead... */ + if (F1->unpair) joinlines(F1, NULL); - slurp(F1); - } - else if (cval < 0) - /* File 1 takes the lead... */ - slurp(F1); - if (F2->unpair - && (cval > 0 || F1->set->cfieldc == F1->setusedc -1)) { + slurp(F1); + } else { + /* File 2 takes the lead... */ + if (F2->unpair) joinlines(F2, NULL); - slurp(F2); - } - else if (cval > 0) - /* File 2 takes the lead... */ - slurp(F2); + slurp(F2); } } @@ -273,36 +258,15 @@ main(int argc, char *argv[]) return 0; } -/* wrapper around slurpit() to keep track of what field we are on */ -void slurp(INPUT *F) -{ - long fpos; - u_long cfieldc; - - if (F->set == NULL) { - fpos = 0; - cfieldc = 0; - } - else { - fpos = F->set->fpos; - cfieldc = F->set->cfieldc; - } - slurpit(F); - if (F->set == NULL) - return; - else if (fpos != F->set->fpos) - F->set->cfieldc = cfieldc+1; -} - void -slurpit(INPUT *F) +slurp(INPUT *F) { LINE *lp, *lastlp, tmp; ssize_t len; size_t linesize; u_long cnt; char *bp, *fieldp, *line; - long fpos; + /* * Read all of the lines from an input file that have the same * join field. @@ -311,7 +275,7 @@ slurpit(INPUT *F) F->setcnt = 0; line = NULL; linesize = 0; - for (lastlp = NULL; ; ++F->setcnt, lastlp = lp) { + for (lastlp = NULL; ; ++F->setcnt) { /* * If we're out of space to hold line structures, allocate * more. Initialize the structure so that we know that this @@ -339,6 +303,8 @@ slurpit(INPUT *F) * level of indirection, but it's probably okay as is. */ lp = &F->set[F->setcnt]; + if (F->setcnt) + lastlp = &F->set[F->setcnt - 1]; if (F->pushbool) { tmp = F->set[F->setcnt]; F->set[F->setcnt] = F->set[F->pushback]; @@ -348,11 +314,6 @@ slurpit(INPUT *F) } if ((len = getline(&line, &linesize, F->fp)) == -1) break; - /* - * we depend on knowing on what field we are, one safe way is - * the file position. - */ - fpos = ftell(F->fp) - len; /* Remove trailing newline, if it exists, and copy line. */ if (line[len - 1] == '\n') @@ -366,10 +327,8 @@ slurpit(INPUT *F) lp->line = p; lp->linealloc = newsize; } - F->setusedc++; memcpy(lp->line, line, len); lp->line[len] = '\0'; - lp->fpos = fpos; /* Split the line into fields, allocate space as necessary. */ lp->fieldcnt = 0;
