Any developer wants to ok this?

        -Otto

On Sun, Oct 07, 2018 at 10:08:08AM +0200, Otto Moerbeek wrote:

> On Sat, Oct 06, 2018 at 08:57:35PM +0200, Andreas Kusalananda Kähäri wrote:
> 
> > >Synopsis:      Wrong result in right and full outer join with join(1)
> > >Category:      system
> > >Environment:
> >         System      : OpenBSD 6.3
> >         Details     : OpenBSD 6.3 (GENERIC.MP) #11: Thu Sep 20 16:05:37 
> > CEST 2018
> >                          
> > [email protected]:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> > 
> >         Architecture: OpenBSD.amd64
> >         Machine     : amd64
> > >Description:
> > 
> > The join(1) utility produces the wrong result when performing a right
> > join or a full outer join between two files under some circumstances.
> > 
> > >How-To-Repeat:
> > 
> > $ cat test1.input
> > 1,A1,B1
> > 2
> > 3,E1,F1
> > 
> > (it makes no difference if the second line reads "2,," or "2,C1," or
> > "2,C1,D1" above)
> > 
> > $ cat test2.input
> > 1,A2,B2
> > 3,E2,F2
> > 
> > (none of the files has an empty line at the end, that's only added for
> > readability here)
> > 
> > Doing a left join (using -a1) works as expected:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 test1.input test2.input
> > 1,A1,B1,A2,B2
> > 2,,,,
> > 3,E1,F1,E2,F2
> > 
> > Doing a right join (using -a2):
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a2 test1.input test2.input
> > 1,A1,B1,A2,B2
> > 3,,,E2,F2
> > 
> > The data from the first file is dropped even though there should be a
> > match on its "3" line.
> > 
> > The expected result is
> > 1,A1,B1,A2,B2
> > 3,E1,F1,E2,F2
> > 
> > Doing a full outer join:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 -a2 test1.input test2.input
> > 1,A1,B1,A2,B2
> > 2,,,,
> > 3,,,E2,F2
> > 3,E1,F1,,
> > 
> > Again, the "3" line in the two files don't appear to get matched up.
> > 
> > The expected result is
> > 1,A1,B1,A2,B2
> > 2,,,,
> > 3,E1,F1,E2,F2
> > 
> > Doing the join with the two files swapped around works in all cases:
> > 
> > Left:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 test2.input test1.input
> > 1,A2,B2,A1,B1
> > 3,E2,F2,E1,F1
> > 
> > Right:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a2 test2.input test1.input
> > 1,A2,B2,A1,B1
> > 2,,,,
> > 3,E2,F2,E1,F1
> > 
> > Full outer:
> > $ join -t, -o0,1.2,1.3,2.2,2.3 -a1 -a2 test2.input test1.input
> > 1,A2,B2,A1,B1
> > 2,,,,
> > 3,E2,F2,E1,F1
> > 
> > Also, inner joins (without using -a at all) works as it should.
> > 
> > $ hexdump -C test1.input
> > 00000000  31 2c 41 31 2c 42 31 0a  32 2c 43 31 2c 44 31 0a  
> > |1,A1,B1.2,C1,D1.|
> > 00000010  33 2c 45 31 2c 46 31 0a                           |3,E1,F1.|
> > 00000018
> > $ hexdump -C test2.input
> > 00000000  31 2c 41 32 2c 42 32 0a  33 2c 45 32 2c 46 32 0a  
> > |1,A2,B2.3,E2,F2.|
> > 00000010
> 
> Hi,
> 
> This diff fixes your case. In the past, we cherry picked some changes
> from FreeBSD, but for some reason the main logic was left alone, while
> our version was overly complex. FreeBSDs main loop is more simple and
> easy to understand.
> 
> This need carefull scrutiny, since an error is easily made and I mostly
> did this without enough coffee.
> 
>       -Otto
> 
> Index: join.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/join/join.c,v
> retrieving revision 1.28
> diff -u -p -r1.28 join.c
> --- join.c    18 Jul 2018 17:20:54 -0000      1.28
> +++ join.c    7 Oct 2018 07:58:02 -0000
> @@ -53,8 +53,6 @@ typedef struct {
>       char **fields;                  /* line field(s) */
>       u_long fieldcnt;                /* line field(s) count */
>       u_long fieldalloc;              /* line field(s) allocated count */
> -     u_long cfieldc;                 /* current field count */
> -     long     fpos;                  /* fpos of start of field */
>  } LINE;
>  
>  typedef struct {
> @@ -67,10 +65,9 @@ typedef struct {
>       u_long pushback;                /* line on the stack */
>       u_long setcnt;                  /* set count */
>       u_long setalloc;                /* set allocated count */
> -     u_long setusedc;                /* sets used  */
>  } INPUT;
> -INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0, 0 },
> -      input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0, 0 };
> +INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 },
> +      input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 };
>  
>  typedef struct {
>       u_long  filenum;        /* file number */
> @@ -214,12 +211,8 @@ main(int argc, char *argv[])
>       if (pledge("stdio", NULL) == -1)
>               err(1, "pledge");
>  
> -     F1->setusedc = 0;
> -     F2->setusedc = 0;
>       slurp(F1);
>       slurp(F2);
> -     F1->set->cfieldc = 0;
> -     F2->set->cfieldc = 0;
>  
>       /*
>        * We try to let the files have the same field value, advancing
> @@ -234,24 +227,16 @@ main(int argc, char *argv[])
>                               joinlines(F1, F2);
>                       slurp(F1);
>                       slurp(F2);
> -             }
> -             else {
> -                     if (F1->unpair
> -                     && (cval < 0 || F2->set->cfieldc == F2->setusedc -1)) {
> +             } else if (cval < 0) {
> +                     /* File 1 takes the lead... */
> +                     if (F1->unpair)
>                               joinlines(F1, NULL);
> -                             slurp(F1);
> -                     }
> -                     else if (cval < 0)
> -                             /* File 1 takes the lead... */
> -                             slurp(F1);
> -                     if (F2->unpair
> -                     && (cval > 0 || F1->set->cfieldc == F1->setusedc -1)) {
> +                     slurp(F1);
> +             } else {
> +                     /* File 2 takes the lead... */
> +                     if (F2->unpair)
>                               joinlines(F2, NULL);
> -                             slurp(F2);
> -                     }
> -                     else if (cval > 0)
> -                             /* File 2 takes the lead... */
> -                             slurp(F2);
> +                     slurp(F2);
>               }
>       }
>  
> @@ -273,36 +258,15 @@ main(int argc, char *argv[])
>       return 0;
>  }
>  
> -/* wrapper around slurpit() to keep track of what field we are on */
> -void slurp(INPUT *F)
> -{
> -     long fpos;
> -     u_long cfieldc;
> -
> -     if (F->set == NULL) {
> -             fpos = 0;
> -             cfieldc = 0;
> -     }
> -     else {
> -             fpos = F->set->fpos;
> -             cfieldc = F->set->cfieldc;
> -     }
> -     slurpit(F);
> -     if (F->set == NULL)
> -             return;
> -     else if (fpos != F->set->fpos)
> -             F->set->cfieldc = cfieldc+1;
> -}
> -
>  void
> -slurpit(INPUT *F)
> +slurp(INPUT *F)
>  {
>       LINE *lp, *lastlp, tmp;
>       ssize_t len;
>       size_t linesize;
>       u_long cnt;
>       char *bp, *fieldp, *line;
> -     long fpos;
> +
>       /*
>        * Read all of the lines from an input file that have the same
>        * join field.
> @@ -311,7 +275,7 @@ slurpit(INPUT *F)
>       F->setcnt = 0;
>       line = NULL;
>       linesize = 0;
> -     for (lastlp = NULL; ; ++F->setcnt, lastlp = lp) {
> +     for (lastlp = NULL; ; ++F->setcnt) {
>               /*
>                * If we're out of space to hold line structures, allocate
>                * more.  Initialize the structure so that we know that this
> @@ -339,6 +303,8 @@ slurpit(INPUT *F)
>                * level of indirection, but it's probably okay as is.
>                */
>               lp = &F->set[F->setcnt];
> +             if (F->setcnt)
> +                     lastlp = &F->set[F->setcnt - 1];
>               if (F->pushbool) {
>                       tmp = F->set[F->setcnt];
>                       F->set[F->setcnt] = F->set[F->pushback];
> @@ -348,11 +314,6 @@ slurpit(INPUT *F)
>               }
>               if ((len = getline(&line, &linesize, F->fp)) == -1)
>                       break;
> -             /*
> -              * we depend on knowing on what field we are, one safe way is
> -              * the file position.
> -             */
> -             fpos = ftell(F->fp) - len;
>  
>               /* Remove trailing newline, if it exists, and copy line. */
>               if (line[len - 1] == '\n')
> @@ -366,10 +327,8 @@ slurpit(INPUT *F)
>                       lp->line = p;
>                       lp->linealloc = newsize;
>               }
> -             F->setusedc++;
>               memcpy(lp->line, line, len);
>               lp->line[len] = '\0';
> -             lp->fpos = fpos;
>  
>               /* Split the line into fields, allocate space as necessary. */
>               lp->fieldcnt = 0;

Reply via email to