Phrase searching can get extremely slow due to O(n**2) behaviour
when there are large numbers of matches for each word: every match
of one word is compared against every match of the next word.

This patch to parser.cc makes it linear. Could someone please apply it?

--- parser.cc.old       Thu Sep 20 11:05:30 2001
+++ parser.cc   Thu Sep 20 11:07:03 2001
@@ -20,6 +20,7 @@
 #include "parser.h"
 #include "HtPack.h"
 #include "Collection.h"
+#include "Dictionary.h"
 
 #define        WORD    1000
 #define        DONE    1001
@@ -350,24 +351,50 @@
     // OK, now we have a previous list in wordList and a new list
     List       *results = new List;
 
+    Dictionary  newDict(5000);
+
+    String nid; 
+    newWords->Start_Get();
+    while ((newWord = (HtWordReference *) newWords->Get_Next()))
+      {
+        nid = "";
+        int did =  newWord->DocID();
+        nid << did;
+        nid << "-";
+        int loc = newWord->Location();
+        nid << loc;
+        if (! newDict.Exists(nid)) { 
+           newDict.Add(nid, (Object *)newWord);
+        } else {
+//         cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n";
+//          Double addition is a problem if you don't want your original objects deleted
+        }
+      }
+
+    String oid;
     oldWords->Start_Get();
     while ((oldWord = (HtWordReference *) oldWords->Get_Next()))
       {
-       newWords->Start_Get();
-       while ((newWord = (HtWordReference *) newWords->Get_Next()))
-         {
-           if (oldWord->DocID() == newWord->DocID())
-             if ((oldWord->Location() + 1) == newWord->Location())
-               {
-                 HtWordReference *result = new HtWordReference(*oldWord);
+        oid = "";
+        int did =  oldWord->DocID();
+        oid << did;
+        oid << "-";
+        int loc = oldWord->Location();
+        oid << loc+1;
+         if (newDict.Exists(oid)) 
+          {
+          newWord = (HtWordReference *)newDict.Find(oid);
+           
+          HtWordReference *result = new HtWordReference(*oldWord);
 
-                 result->Flags(oldWord->Flags() & newWord->Flags());
-                 result->Location(newWord->Location());
+          result->Flags(oldWord->Flags() & newWord->Flags());
+          result->Location(newWord->Location());
                  
-                 results->Add(result);
-               }
+          results->Add(result);
          }
       }
+
+    newDict.Release();
 
     if(debug) cerr << "old words count: " << oldWords->Count() << endl;
     if(debug) cerr << "results count: " << results->Count() << endl;

-- 
 Toivo Pedaste                        Email:  [EMAIL PROTECTED]
 University Communications Services,  Phone:  +61 8 9 380 2605
 University of Western Australia      Fax:    +61 8 9 380 1109
"The time has come", the Walrus said, "to talk of many things"...

_______________________________________________
htdig-dev mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/htdig-dev

Reply via email to