The way join works is this : For a given match, read all the matching lines from both files into memory, and then print out the join results.
The problem : If you're joining really big files (hundreds of gigabytes), the matching lines from one file or the other can exceed your process limit (no more than 2 gig, usually) The solution : Read the two lists in parallel, until one is done, and then use the other, one line at a time, from disk. The result : No performance hit that I can find. Peak memory requirement vastly reduced. Some joins work that previously crashed. The patch (from join.c in 2.0.20) : --- src/join.c-orig Wed Feb 13 09:35:13 2002 +++ src/join.c Thu Feb 14 10:05:00 2002 @@ -494,6 +494,7 @@ struct seq seq1, seq2; struct line line; int diff, i, j, eof1, eof2; + int end1, end2; /* Read the first line of each file. */ initseq (&seq1); @@ -523,35 +524,85 @@ continue; } - /* Keep reading lines from file1 as long as they continue to - match the current line from file2. */ + /* Read lines from file1 and file2 until one of them stops matching the other */ eof1 = 0; - do - if (!getseq (fp1, &seq1)) - { - eof1 = 1; - ++seq1.count; - break; - } - while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0])); - - /* Keep reading lines from file2 as long as they continue to - match the current line from file1. */ eof2 = 0; - do - if (!getseq (fp2, &seq2)) - { - eof2 = 1; - ++seq2.count; - break; - } - while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1])); + end1 = 0; + end2 = 0; + + while (1) + { + if (!getseq (fp1, &seq1)) + { + eof1 = 1; + end1 = 1; + ++seq1.count; + break; + } + + if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0])) + { + end1 = 1; + break; + } - if (print_pairables) + if (!getseq (fp2, &seq2)) + { + eof2 = 1; + end2 = 1; + ++seq2.count; + break; + } + if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1])) + { + end2 = 1; + break; + } + } + + if (end1) + { + for (i = 0; i < seq2.count; ++i) + { + for (j = 0; j < seq1.count - 1; ++j) + if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]); + freeline (&seq2.lines[i]); + } + while (1) { + seq2.count = 0; + if (!getseq (fp2, &seq2)) + { + eof2 = 1; + ++seq2.count; + break; + } + if (keycmp (&seq1.lines[0], &seq2.lines[0])) break; + for (j = 0; j < seq1.count - 1; ++j) + if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]); + freeline (&seq2.lines[0]); + } + } + else /* end2 */ { - for (i = 0; i < seq1.count - 1; ++i) + for (i = 0; i < seq1.count; ++i) + { + for (j = 0; j < seq2.count - 1; ++j) + if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]); + freeline (&seq1.lines[i]); + } + while (1) { + seq1.count = 0; + if (!getseq (fp1, &seq1)) + { + eof1 = 1; + ++seq1.count; + break; + } + if (keycmp (&seq1.lines[0], &seq2.lines[0])) break; for (j = 0; j < seq2.count - 1; ++j) - prjoin (&seq1.lines[i], &seq2.lines[j]); + if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]); + freeline (&seq1.lines[0]); + } } for (i = 0; i < seq1.count - 1; ++i) _______________________________________________ Bug-textutils mailing list [EMAIL PROTECTED] http://mail.gnu.org/mailman/listinfo/bug-textutils