join can fail on large files

Andrew D Jewell Thu, 14 Feb 2002 11:52:45 -0800

The way join works is this :

For a given match, read all the matching lines from both files into 
memory, and then print out the join results.


The problem :

If you're joining really big files (hundreds of gigabytes), the 
matching lines from one file or the other can exceed your process 
limit (no more than 2 gig, usually)

The solution :

Read the two lists in parallel, until one is done, and then use the 
other, one line at a time, from disk.

The result :

No performance hit that I can find.
Peak memory requirement vastly reduced.
Some joins work that previously crashed.

The patch (from join.c in 2.0.20) :

--- src/join.c-orig  Wed Feb 13 09:35:13 2002
+++ src/join.c      Thu Feb 14 10:05:00 2002
@@ -494,6 +494,7 @@
    struct seq seq1, seq2;
    struct line line;
    int diff, i, j, eof1, eof2;
+  int end1, end2;

    /* Read the first line of each file.  */
    initseq (&seq1);
@@ -523,35 +524,85 @@
           continue;
         }

-      /* Keep reading lines from file1 as long as they continue to
-         match the current line from file2.  */
+      /* Read lines from file1 and file2 until one of them stops 
matching the other */
        eof1 = 0;
-      do
-       if (!getseq (fp1, &seq1))
-         {
-           eof1 = 1;
-           ++seq1.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
-      /* Keep reading lines from file2 as long as they continue to
-         match the current line from file1.  */
        eof2 = 0;
-      do
-       if (!getseq (fp2, &seq2))
-         {
-           eof2 = 1;
-           ++seq2.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+      end1 = 0;
+      end2 = 0;
+
+      while (1)
+       {
+         if (!getseq (fp1, &seq1))
+           {
+             eof1 = 1;
+             end1 = 1;
+             ++seq1.count;
+             break;
+           }
+
+         if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+           {
+             end1 = 1;
+             break;
+           }

-      if (print_pairables)
+         if (!getseq (fp2, &seq2))
+           {
+             eof2 = 1;
+             end2 = 1;
+             ++seq2.count;
+             break;
+           }
+         if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+           {
+             end2 = 1;
+             break;
+           }
+       }
+
+      if (end1)
+       {
+         for (i = 0; i < seq2.count; ++i)
+           {
+             for (j = 0; j < seq1.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+             freeline (&seq2.lines[i]);
+           }
+         while (1) {
+           seq2.count = 0;
+           if (!getseq (fp2, &seq2))
+             {
+               eof2 = 1;
+               ++seq2.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+           for (j = 0; j < seq1.count - 1; ++j)
+             if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+           freeline (&seq2.lines[0]);
+         }
+       }
+      else /* end2 */
         {
-         for (i = 0; i < seq1.count - 1; ++i)
+         for (i = 0; i < seq1.count; ++i)
+           {
+             for (j = 0; j < seq2.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+             freeline (&seq1.lines[i]);
+           }
+         while (1) {
+           seq1.count = 0;
+           if (!getseq (fp1, &seq1))
+             {
+               eof1 = 1;
+               ++seq1.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
             for (j = 0; j < seq2.count - 1; ++j)
-             prjoin (&seq1.lines[i], &seq2.lines[j]);
+             if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+           freeline (&seq1.lines[0]);
+         }
         }

        for (i = 0; i < seq1.count - 1; ++i)

_______________________________________________
Bug-textutils mailing list
[EMAIL PROTECTED]
http://mail.gnu.org/mailman/listinfo/bug-textutils

join can fail on large files

Reply via email to