When joining large files where one file has a very large number of
lines with the same join field, memory use is excessive (because join
reads all matching lines from both files into RAM before proceeding).

Below are diffs that solve this problem.

Andy Jewell
[EMAIL PROTECTED]


--- textutils-2.0/src/join.c    Sun Jul  4 03:38:02 1999
+++ join.c      Wed Oct 25 11:27:21 2000
@@ -486,6 +501,7 @@
    struct seq seq1, seq2;
    struct line line;
    int diff, i, j, eof1, eof2;
+  int end1, end2;

    /* Read the first line of each file.  */
    initseq (&seq1);
@@ -515,35 +531,85 @@
           continue;
         }

-      /* Keep reading lines from file1 as long as they continue to
-         match the current line from file2.  */
+      /* Read lines from file1 and file2 until one of them stops matching the other */
        eof1 = 0;
-      do
-       if (!getseq (fp1, &seq1))
-         {
-           eof1 = 1;
-           ++seq1.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
-      /* Keep reading lines from file2 as long as they continue to
-         match the current line from file1.  */
        eof2 = 0;
-      do
-       if (!getseq (fp2, &seq2))
-         {
-           eof2 = 1;
-           ++seq2.count;
-           break;
-         }
-      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+      end1 = 0;
+      end2 = 0;

-      if (print_pairables)
+      while (1)
         {
-         for (i = 0; i < seq1.count - 1; ++i)
+         if (!getseq (fp1, &seq1))
+           {
+             eof1 = 1;
+             end1 = 1;
+             ++seq1.count;
+             break;
+           }
+
+         if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+           {
+             end1 = 1;
+             break;
+           }
+
+         if (!getseq (fp2, &seq2))
+           {
+             eof2 = 1;
+             end2 = 1;
+             ++seq2.count;
+             break;
+           }
+         if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+           {
+             end2 = 1;
+             break;
+           }
+       }
+
+      if (end1)
+       {
+         for (i = 0; i < seq2.count; ++i)
+           {
+             for (j = 0; j < seq1.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+             freeline (&seq2.lines[i]);
+           }
+         while (1) {
+           seq2.count = 0;
+           if (!getseq (fp2, &seq2))
+             {
+               eof2 = 1;
+               ++seq2.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+           for (j = 0; j < seq1.count - 1; ++j)
+             if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+           freeline (&seq2.lines[0]);
+         }
+       }
+      else /* end2 */
+       {
+         for (i = 0; i < seq1.count; ++i)
+           {
+             for (j = 0; j < seq2.count - 1; ++j)
+               if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+             freeline (&seq1.lines[i]);
+           }
+         while (1) {
+           seq1.count = 0;
+           if (!getseq (fp1, &seq1))
+             {
+               eof1 = 1;
+               ++seq1.count;
+               break;
+             }
+           if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
             for (j = 0; j < seq2.count - 1; ++j)
-             prjoin (&seq1.lines[i], &seq2.lines[j]);
+             if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+           freeline (&seq1.lines[0]);
+         }
         }

        for (i = 0; i < seq1.count - 1; ++i)
@@ -555,7 +621,7 @@
         }
        else
         seq1.count = 0;
-
+
        for (i = 0; i < seq2.count - 1; ++i)
         freeline (&seq2.lines[i]);
        if (!eof2)

_______________________________________________
Bug-textutils mailing list
[EMAIL PROTECTED]
http://mail.gnu.org/mailman/listinfo/bug-textutils

Reply via email to