Module Name: src Committed By: dsl Date: Tue Aug 18 18:00:28 UTC 2009
Modified Files: src/usr.bin/sort: append.c files.c fsort.c sort.c sort.h Log Message: The code that attempted to sort large files by sorting each chunk by the first key byte and writing to a temp file, then sorting the records from each temp file that had the same first key byte (and repeating for upto 4 key bytes) was a nice idea, but completely doomed to failure. Eg PR/9308 where a 70MB file has all but one record the same and short keys. Not only does the code not work, it is rather guaranteed to be slow. Instead always use a merge sort for fully sorted chunk of records (each temporary file contains one lot of sorted records). The -H option already did this, so just rip out all the code and variables that can't be used when -H was specified. Further cleanup to come ... To generate a diff of this commit: cvs rdiff -u -r1.17 -r1.18 src/usr.bin/sort/append.c cvs rdiff -u -r1.33 -r1.34 src/usr.bin/sort/files.c cvs rdiff -u -r1.36 -r1.37 src/usr.bin/sort/fsort.c cvs rdiff -u -r1.49 -r1.50 src/usr.bin/sort/sort.c cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/sort.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/usr.bin/sort/append.c diff -u src/usr.bin/sort/append.c:1.17 src/usr.bin/sort/append.c:1.18 --- src/usr.bin/sort/append.c:1.17 Sun Aug 16 20:02:04 2009 +++ src/usr.bin/sort/append.c Tue Aug 18 18:00:28 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $ */ +/* $NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -64,7 +64,7 @@ #include "sort.h" #ifndef lint -__RCSID("$NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $"); +__RCSID("$NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $"); __SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -183,38 +183,3 @@ put(crec, fp); } } - -/* - * output the already sorted eol bin. - */ -void -rd_append(int binno, int infl0, int nfiles, FILE *outfp, u_char *buffer, - u_char *bufend) -{ - RECHEADER *rec; - - rec = (RECHEADER *) buffer; - if (!getnext(binno, infl0, NULL, nfiles, - (RECHEADER *) buffer, bufend, 0)) { - putline(rec, outfp); - while (getnext(binno, infl0, NULL, nfiles, (RECHEADER *) buffer, - bufend, 0) == 0) { - if (!UNIQUE) - putline(rec, outfp); - } - } -} - -/* - * append plain text--used after sorting the biggest bin. - */ -void -concat(FILE *a, FILE *b) -{ - int nread; - char buffer[4096]; - - rewind(b); - while ((nread = fread(buffer, 1, 4096, b)) > 0) - EWRITE(buffer, 1, nread, a); -} Index: src/usr.bin/sort/files.c diff -u src/usr.bin/sort/files.c:1.33 src/usr.bin/sort/files.c:1.34 --- src/usr.bin/sort/files.c:1.33 Sun Aug 16 19:53:43 2009 +++ src/usr.bin/sort/files.c Tue Aug 18 18:00:28 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $ */ +/* $NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -65,7 +65,7 @@ #include "fsort.h" #ifndef lint -__RCSID("$NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $"); +__RCSID("$NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $"); __SCCSID("@(#)files.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -74,73 +74,6 @@ static ssize_t seq(FILE *, u_char **); /* - * this is the subroutine for file management for fsort(). - * It keeps the buffers for all temporary files. - */ -int -getnext(int binno, int infl0, struct filelist *filelist, int nfiles, - RECHEADER *pos, u_char *end, struct field *dummy) -{ - int i; - u_char *hp; - static size_t nleft = 0; - static int cnt = 0, flag = -1; - static u_char maxb = 0; - static FILE *fp; - - if (nleft == 0) { - if (binno < 0) /* reset files. */ { - for (i = 0; i < nfiles; i++) { - rewind(fstack[infl0 + i].fp); - fstack[infl0 + i].max_o = 0; - } - flag = -1; - nleft = cnt = 0; - return (-1); - } - maxb = fstack[infl0].maxb; - for (; nleft == 0; cnt++) { - if (cnt >= nfiles) { - cnt = 0; - return (EOF); - } - fp = fstack[infl0 + cnt].fp; - fread(&nleft, sizeof(nleft), 1, fp); - if (binno < maxb) - fstack[infl0+cnt].max_o - += sizeof(nleft) + nleft; - else if (binno == maxb) { - if (binno != fstack[infl0].lastb) { - fseeko(fp, fstack[infl0+ - cnt].max_o, SEEK_SET); - fread(&nleft, sizeof(nleft), 1, fp); - } - if (nleft == 0) - fclose(fp); - } else if (binno == maxb + 1) { /* skip a bin */ - fseek(fp, nleft, SEEK_CUR); - fread(&nleft, sizeof(nleft), 1, fp); - flag = cnt; - } - } - } - if ((u_char *) pos > end - REC_DATA_OFFSET) - return (BUFFEND); - fread(pos, REC_DATA_OFFSET, 1, fp); - if (end - pos->data < (ptrdiff_t)pos->length) { - hp = ((u_char *)pos) + REC_DATA_OFFSET; - for (i = REC_DATA_OFFSET; i ; i--) - ungetc(*--hp, fp); - return (BUFFEND); - } - fread(pos->data, pos->length, 1, fp); - nleft -= pos->length + REC_DATA_OFFSET; - if (nleft == 0 && binno == fstack[infl0].maxb) - fclose(fp); - return (0); -} - -/* * this is called when there is no special key. It's only called * in the first fsort pass. */ Index: src/usr.bin/sort/fsort.c diff -u src/usr.bin/sort/fsort.c:1.36 src/usr.bin/sort/fsort.c:1.37 --- src/usr.bin/sort/fsort.c:1.36 Sun Aug 16 20:02:04 2009 +++ src/usr.bin/sort/fsort.c Tue Aug 18 18:00:28 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $ */ +/* $NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -62,17 +62,17 @@ */ /* - * Read in the next bin. If it fits in one segment sort it; - * otherwise refine it by segment deeper by one character, - * and try again on smaller bins. Sort the final bin at this level - * of recursion to keep the head of fstack at 0. - * After PANIC passes, abort to merge sort. + * Read in a block of records (until 'enough'). + * sort, write to temp file. + * Merge sort temp files into output file + * Small files miss out the temp file stage. + * Large files might get multiple merges. */ #include "sort.h" #include "fsort.h" #ifndef lint -__RCSID("$NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $"); +__RCSID("$NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $"); __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -100,17 +100,13 @@ const u_char **keypos; u_char *bufend; u_char *weights; - int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; - int c, nelem, base; - long sizes [NBINS + 1]; + int ntfiles, mfct = 0; + int c, nelem; get_func_t get; struct recheader *crec; struct field tfield[2]; - FILE *prevfp, *tailfp[FSORTMAX + 1]; u_char *nbuffer; - memset(tailfp, 0, sizeof(tailfp)); - prevfp = outfp; memset(tfield, 0, sizeof(tfield)); if (ftbl[0].flags & R) tfield[0].weights = Rascii; @@ -124,256 +120,113 @@ memset(keylist, 0, MAXNUM * sizeof(u_char *)); } bufend = buffer + bufsize; - if (binno >= 0) { - base = top + nfiles; - get = getnext; - } else { - base = 0; - if (SINGL_FLD) - get = makeline; - else - get = makekey; - } - for (;;) { - memset(sizes, 0, sizeof(sizes)); - c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */ - if (binno == weights[REC_D] && - !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ - rd_append(weights[REC_D], top, - nfiles, prevfp, buffer, bufend); - break; - } else if (binno == weights[REC_D]) { - depth = 0; /* start over on flat weights */ - ftbl = tfield; - weights = ftbl[0].weights; - } - keypos = keylist; /* XXXGCC -Wuninitialized m68k */ - crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */ - while (c != EOF) { - keypos = keylist; - nelem = 0; - crec = (RECHEADER *) buffer; - - do_read: - while ((c = get(binno, top, filelist, nfiles, crec, - bufend, ftbl)) == 0) { - *keypos++ = crec->data + depth; - if (++nelem == MAXNUM) { - c = BUFFEND; - break; - } - crec =(RECHEADER *)((char *) crec + - SALIGN(crec->length) + REC_DATA_OFFSET); - } - - if (c == BUFFEND && nelem < MAXNUM - && bufsize < MAXBUFSIZE) { - const u_char **keyp; - u_char *oldb = buffer; - - /* buffer was too small for data, allocate - * bigger buffer */ - nbuffer = realloc(buffer, bufsize * 2); - if (!nbuffer) { - err(2, "failed to realloc buffer to %lu bytes", - (unsigned long) bufsize * 2); - } - buffer = nbuffer; - bufsize *= 2; - bufend = buffer + bufsize; - - /* patch up keylist[] */ - for (keyp = &keypos[-1]; keyp >= keylist; keyp--) - *keyp = buffer + (*keyp - oldb); - - crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); - goto do_read; - } - - if (c != BUFFEND && !ntfiles && !mfct) { - /* do not push */ - continue; - } + if (SINGL_FLD) + get = makeline; + else + get = makekey; - /* push */ - if (panic >= PANIC) { - fstack[MSTART + mfct].fp = ftmp(); - if ((stable_sort) - ? sradixsort(keylist, nelem, - weights, REC_D) - : radixsort(keylist, nelem, - weights, REC_D) ) - err(2, NULL); - append(keylist, nelem, depth, - fstack[MSTART + mfct].fp, putrec, - ftbl); - mfct++; - /* reduce number of open files */ - if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { - /* - * Only copy extra incomplete crec - * data if there are any. - */ - int nodata = (bufend >= (u_char *)crec - && bufend <= crec->data); - size_t sz=0; - u_char *tmpbuf=NULL; - - if (!nodata) { - sz = bufend - crec->data; - tmpbuf = malloc(sz); - memmove(tmpbuf, crec->data, sz); - } - - CHECKFSTACK(base + ntfiles); - fstack[base + ntfiles].fp = ftmp(); - fmerge(0, MSTART, filelist, - mfct, geteasy, - fstack[base + ntfiles].fp, - putrec, ftbl); - ntfiles++; - mfct = 0; - - if (!nodata) { - memmove(crec->data, tmpbuf, sz); - free(tmpbuf); - } - } - } else { - CHECKFSTACK(base + ntfiles); - fstack[base + ntfiles].fp = ftmp(); - onepass(keylist, depth, nelem, sizes, - weights, fstack[base + ntfiles].fp); - ntfiles++; + c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */ + keypos = keylist; /* XXXGCC -Wuninitialized m68k */ + crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */ + while (c != EOF) { + keypos = keylist; + nelem = 0; + crec = (RECHEADER *) buffer; + + do_read: + while ((c = get(-1, top, filelist, nfiles, crec, + bufend, ftbl)) == 0) { + *keypos++ = crec->data + depth; + if (++nelem == MAXNUM) { + c = BUFFEND; + break; } + crec =(RECHEADER *)((char *) crec + + SALIGN(crec->length) + REC_DATA_OFFSET); } - if (!ntfiles && !mfct) { /* everything in memory--pop */ - if (nelem > 1 - && ((stable_sort) - ? sradixsort(keylist, nelem, weights, REC_D) - : radixsort(keylist, nelem, weights, REC_D) )) - err(2, NULL); - if (nelem > 0) - append(keylist, nelem, depth, outfp, putline, ftbl); - break; /* pop */ - } - if (panic >= PANIC) { - if (!ntfiles) - fmerge(0, MSTART, filelist, mfct, geteasy, - outfp, putline, ftbl); - else - fmerge(0, base, filelist, ntfiles, geteasy, - outfp, putline, ftbl); - break; - - } - total = maxb = lastb = 0; /* find if one bin dominates */ - for (i = 0; i < NBINS; i++) - if (sizes[i]) { - if (sizes[i] > sizes[maxb]) - maxb = i; - lastb = i; - total += sizes[i]; + + if (c == BUFFEND && nelem < MAXNUM + && bufsize < MAXBUFSIZE) { + const u_char **keyp; + u_char *oldb = buffer; + + /* buffer was too small for data, allocate + * bigger buffer */ + nbuffer = realloc(buffer, bufsize * 2); + if (!nbuffer) { + err(2, "failed to realloc buffer to %lu bytes", + (unsigned long) bufsize * 2); } - if (sizes[maxb] < max((total / 2) , BUFSIZE)) - maxb = lastb; /* otherwise pop after last bin */ - fstack[base].lastb = lastb; - fstack[base].maxb = maxb; - - /* start refining next level. */ - getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */ - for (i = 0; i < maxb; i++) { - if (!sizes[i]) /* bin empty; step ahead file offset */ - getnext(i, base, NULL,ntfiles, crec, bufend, 0); - else - fsort(i, depth + 1, base, filelist, ntfiles, - outfp, ftbl); - } + buffer = nbuffer; + bufsize *= 2; + bufend = buffer + bufsize; - get = getnext; + /* patch up keylist[] */ + for (keyp = &keypos[-1]; keyp >= keylist; keyp--) + *keyp = buffer + (*keyp - oldb); - if (lastb != maxb) { - if (prevfp != outfp) - tailfp[panic] = prevfp; - prevfp = ftmp(); - for (i = maxb + 1; i <= lastb; i++) - if (!sizes[i]) - getnext(i, base, NULL, ntfiles, crec, - bufend,0); - else - fsort(i, depth + 1, base, filelist, - ntfiles, prevfp, ftbl); + crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb)); + goto do_read; } - /* sort biggest (or last) bin at this level */ - depth++; - panic++; - binno = maxb; - top = base; - nfiles = ntfiles; /* so overwrite them */ - } - if (prevfp != outfp) { - concat(outfp, prevfp); - fclose(prevfp); - } - for (i = panic; i >= 0; --i) - if (tailfp[i]) { - concat(outfp, tailfp[i]); - fclose(tailfp[i]); + if (c != BUFFEND && !ntfiles && !mfct) { + /* do not push */ + continue; } - /* If on top level, free our structures */ - if (depth == 0) { - free(keylist), keylist = NULL; - free(buffer), buffer = NULL; - } -} + /* push */ + fstack[MSTART + mfct].fp = ftmp(); + if (radix_sort(keylist, nelem, weights, REC_D)) + err(2, NULL); + append(keylist, nelem, depth, fstack[MSTART + mfct].fp, putrec, + ftbl); + mfct++; + /* reduce number of open files */ + if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) { + /* + * Only copy extra incomplete crec + * data if there are any. + */ + int nodata = (bufend >= (u_char *)crec + && bufend <= crec->data); + size_t sz=0; + u_char *tmpbuf=NULL; + + if (!nodata) { + sz = bufend - crec->data; + tmpbuf = malloc(sz); + memmove(tmpbuf, crec->data, sz); + } -/* - * This is one pass of radix exchange, dumping the bins to disk. - */ -#define swap(a, b, t) t = a, a = b, b = t -void -onepass(const u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp) -{ - size_t tsizes[NBINS + 1]; - const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp; - static int histo[256]; - int *hp; - int c; - int hdr_off; - const u_char **an, *t, **aj; - const u_char **ak, *r; - - memset(tsizes, 0, sizeof(tsizes)); - hdr_off = REC_DATA_OFFSET + depth; - an = &a[n]; - for (ak = a; ak < an; ak++) { - histo[c = tr[**ak]]++; - tsizes[c] += ((const RECHEADER *) (*ak -= hdr_off))->length; + CHECKFSTACK(ntfiles); + fstack[ntfiles].fp = ftmp(); + fmerge(0, MSTART, filelist, mfct, geteasy, + fstack[ntfiles].fp, putrec, ftbl); + ntfiles++; + mfct = 0; + + if (!nodata) { + memmove(crec->data, tmpbuf, sz); + free(tmpbuf); + } + } } - bin[0] = a; - bpmax = bin + 256; - tp = top, hp = histo; - for (bp = bin; bp < bpmax; bp++) { - *tp++ = *(bp + 1) = *bp + (c = *hp); - *hp++ = 0; - if (c <= 1) - continue; - } - for (aj = a; aj < an; *aj = r, aj = bin[c + 1]) - for (r = *aj; aj < (ak = --top[c = tr[r[hdr_off]]]) ;) - swap(*ak, r, t); - - for (ak = a, c = 0; c < 256; c++) { - an = bin[c + 1]; - n = an - ak; - tsizes[c] += n * REC_DATA_OFFSET; - /* tell getnext how many elements in this bin, this segment. */ - EWRITE(&tsizes[c], sizeof(size_t), 1, fp); - sizes[c] += tsizes[c]; - for (; ak < an; ++ak) - putrec((const RECHEADER *) *ak, fp); - } + if (!ntfiles && !mfct) { /* everything in memory--pop */ + if (nelem > 1 && radix_sort(keylist, nelem, weights, REC_D)) + err(2, NULL); + if (nelem > 0) + append(keylist, nelem, depth, outfp, putline, ftbl); + } + if (!ntfiles) + fmerge(0, MSTART, filelist, mfct, geteasy, + outfp, putline, ftbl); + else + fmerge(0, 0, filelist, ntfiles, geteasy, + outfp, putline, ftbl); + + free(keylist); + keylist = NULL; + free(buffer); + buffer = NULL; } Index: src/usr.bin/sort/sort.c diff -u src/usr.bin/sort/sort.c:1.49 src/usr.bin/sort/sort.c:1.50 --- src/usr.bin/sort/sort.c:1.49 Sat Aug 15 09:48:46 2009 +++ src/usr.bin/sort/sort.c Tue Aug 18 18:00:28 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: sort.c,v 1.49 2009/08/15 09:48:46 dsl Exp $ */ +/* $NetBSD: sort.c,v 1.50 2009/08/18 18:00:28 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -76,7 +76,7 @@ #endif /* not lint */ #ifndef lint -__RCSID("$NetBSD: sort.c,v 1.49 2009/08/15 09:48:46 dsl Exp $"); +__RCSID("$NetBSD: sort.c,v 1.50 2009/08/18 18:00:28 dsl Exp $"); __SCCSID("@(#)sort.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -105,6 +105,7 @@ * Default to stable sort. */ int stable_sort = 1; +int (*radix_sort)(const u_char **, int, const u_char *, u_int) = sradixsort; static char toutpath[MAXPATHLEN]; @@ -169,7 +170,8 @@ fldtab->flags |= tmp; break; case 'H': - PANIC = 0; + /* -H was ; use merge sort for blocks of large files' */ + /* That is now the default. */ break; case 'k': p = realloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab)); @@ -191,9 +193,11 @@ case 's': /* for GNU sort compatibility (this is our default) */ stable_sort = 1; + radix_sort = radixsort; break; case 'S': stable_sort = 0; + radix_sort = sradixsort; break; case 't': if (SEP_FLAG) Index: src/usr.bin/sort/sort.h diff -u src/usr.bin/sort/sort.h:1.22 src/usr.bin/sort/sort.h:1.23 --- src/usr.bin/sort/sort.h:1.22 Sun Aug 16 19:53:43 2009 +++ src/usr.bin/sort/sort.h Tue Aug 18 18:00:28 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: sort.h,v 1.22 2009/08/16 19:53:43 dsl Exp $ */ +/* $NetBSD: sort.h,v 1.23 2009/08/18 18:00:28 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -160,13 +160,13 @@ RECHEADER *, u_char *, struct field *); typedef void (*put_func_t)(const struct recheader *, FILE *); -extern int PANIC; /* maximum depth of fsort before fmerge is called */ extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS]; extern u_char d_mask[NBINS]; extern int SINGL_FLD, SEP_FLAG, UNIQUE; extern int REC_D; extern const char *tmpdir; extern int stable_sort; +extern int (*radix_sort)(const u_char **, int, const u_char *, u_int); extern u_char gweights[NBINS]; extern struct coldesc *clist; extern int ncols; @@ -184,8 +184,6 @@ struct field *); int geteasy(int, int, struct filelist *, int, RECHEADER *, u_char *, struct field *); -int getnext(int, int, struct filelist *, - int, RECHEADER *, u_char *, struct field *); int makekey(int, int, struct filelist *, int, RECHEADER *, u_char *, struct field *); int makeline(int, int, struct filelist *,