On Sun, Oct 07, 2018 at 10:08:08AM +0200, Otto Moerbeek wrote:
> On Sat, Oct 06, 2018 at 08:57:35PM +0200, Andreas Kusalananda Kähäri wrote:
>
> > >Synopsis: Wrong result in right and full outer join with join(1)
> > >Category: system
> > >Environment:
> > System : OpenBSD 6.3
> > Details : OpenBSD 6.3 (GENERIC.MP) #11: Thu Sep 20 16:05:37
> > CEST 2018
> >
> > [email protected]:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> >
> > Architecture: OpenBSD.amd64
> > Machine : amd64
> > >Description:
> >
> > The join(1) utility produces the wrong result when performing a right
> > join or a full outer join between two files under some circumstances.
[cut]
>
> Hi,
>
> This diff fixes your case. In the past, we cherry picked some changes
> from FreeBSD, but for some reason the main logic was left alone, while
> our version was overly complex. FreeBSDs main loop is more simple and
> easy to understand.
>
> This need carefull scrutiny, since an error is easily made and I mostly
> did this without enough coffee.
>
> -Otto
>
I can only confirm that the patch does indeed appear to fix the issue I
was having. As for any other cases, I can't say anything.
Thanks,
Andreas
> Index: join.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/join/join.c,v
> retrieving revision 1.28
> diff -u -p -r1.28 join.c
> --- join.c 18 Jul 2018 17:20:54 -0000 1.28
> +++ join.c 7 Oct 2018 07:58:02 -0000
> @@ -53,8 +53,6 @@ typedef struct {
> char **fields; /* line field(s) */
> u_long fieldcnt; /* line field(s) count */
> u_long fieldalloc; /* line field(s) allocated count */
> - u_long cfieldc; /* current field count */
> - long fpos; /* fpos of start of field */
> } LINE;
>
> typedef struct {
> @@ -67,10 +65,9 @@ typedef struct {
> u_long pushback; /* line on the stack */
> u_long setcnt; /* set count */
> u_long setalloc; /* set allocated count */
> - u_long setusedc; /* sets used */
> } INPUT;
> -INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0, 0 },
> - input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0, 0 };
> +INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 },
> + input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 };
>
> typedef struct {
> u_long filenum; /* file number */
> @@ -214,12 +211,8 @@ main(int argc, char *argv[])
> if (pledge("stdio", NULL) == -1)
> err(1, "pledge");
>
> - F1->setusedc = 0;
> - F2->setusedc = 0;
> slurp(F1);
> slurp(F2);
> - F1->set->cfieldc = 0;
> - F2->set->cfieldc = 0;
>
> /*
> * We try to let the files have the same field value, advancing
> @@ -234,24 +227,16 @@ main(int argc, char *argv[])
> joinlines(F1, F2);
> slurp(F1);
> slurp(F2);
> - }
> - else {
> - if (F1->unpair
> - && (cval < 0 || F2->set->cfieldc == F2->setusedc -1)) {
> + } else if (cval < 0) {
> + /* File 1 takes the lead... */
> + if (F1->unpair)
> joinlines(F1, NULL);
> - slurp(F1);
> - }
> - else if (cval < 0)
> - /* File 1 takes the lead... */
> - slurp(F1);
> - if (F2->unpair
> - && (cval > 0 || F1->set->cfieldc == F1->setusedc -1)) {
> + slurp(F1);
> + } else {
> + /* File 2 takes the lead... */
> + if (F2->unpair)
> joinlines(F2, NULL);
> - slurp(F2);
> - }
> - else if (cval > 0)
> - /* File 2 takes the lead... */
> - slurp(F2);
> + slurp(F2);
> }
> }
>
> @@ -273,36 +258,15 @@ main(int argc, char *argv[])
> return 0;
> }
>
> -/* wrapper around slurpit() to keep track of what field we are on */
> -void slurp(INPUT *F)
> -{
> - long fpos;
> - u_long cfieldc;
> -
> - if (F->set == NULL) {
> - fpos = 0;
> - cfieldc = 0;
> - }
> - else {
> - fpos = F->set->fpos;
> - cfieldc = F->set->cfieldc;
> - }
> - slurpit(F);
> - if (F->set == NULL)
> - return;
> - else if (fpos != F->set->fpos)
> - F->set->cfieldc = cfieldc+1;
> -}
> -
> void
> -slurpit(INPUT *F)
> +slurp(INPUT *F)
> {
> LINE *lp, *lastlp, tmp;
> ssize_t len;
> size_t linesize;
> u_long cnt;
> char *bp, *fieldp, *line;
> - long fpos;
> +
> /*
> * Read all of the lines from an input file that have the same
> * join field.
> @@ -311,7 +275,7 @@ slurpit(INPUT *F)
> F->setcnt = 0;
> line = NULL;
> linesize = 0;
> - for (lastlp = NULL; ; ++F->setcnt, lastlp = lp) {
> + for (lastlp = NULL; ; ++F->setcnt) {
> /*
> * If we're out of space to hold line structures, allocate
> * more. Initialize the structure so that we know that this
> @@ -339,6 +303,8 @@ slurpit(INPUT *F)
> * level of indirection, but it's probably okay as is.
> */
> lp = &F->set[F->setcnt];
> + if (F->setcnt)
> + lastlp = &F->set[F->setcnt - 1];
> if (F->pushbool) {
> tmp = F->set[F->setcnt];
> F->set[F->setcnt] = F->set[F->pushback];
> @@ -348,11 +314,6 @@ slurpit(INPUT *F)
> }
> if ((len = getline(&line, &linesize, F->fp)) == -1)
> break;
> - /*
> - * we depend on knowing on what field we are, one safe way is
> - * the file position.
> - */
> - fpos = ftell(F->fp) - len;
>
> /* Remove trailing newline, if it exists, and copy line. */
> if (line[len - 1] == '\n')
> @@ -366,10 +327,8 @@ slurpit(INPUT *F)
> lp->line = p;
> lp->linealloc = newsize;
> }
> - F->setusedc++;
> memcpy(lp->line, line, len);
> lp->line[len] = '\0';
> - lp->fpos = fpos;
>
> /* Split the line into fields, allocate space as necessary. */
> lp->fieldcnt = 0;
>
--
Andreas Kusalananda Kähäri,
National Bioinformatics Infrastructure Sweden (NBIS),
Uppsala University, Sweden.
När du har kontakt med oss på Uppsala universitet med e-post så innebär det att
vi behandlar dina personuppgifter. För att läsa mer om hur vi gör det kan du
läsa här: http://www.uu.se/om-uu/dataskydd-personuppgifter/
E-mailing Uppsala University means that we will process your personal data. For
more information on how this is performed, please read here:
http://www.uu.se/om-uu/dataskydd-personuppgifter/