Phrase searching can get extremely slow due to quadratic (n**2) behaviour
when there are large numbers of matches for each word.
This patch to parser.cc makes it linear; could someone please apply it?
--- parser.cc.old Thu Sep 20 11:05:30 2001
+++ parser.cc Thu Sep 20 11:07:03 2001
@@ -20,6 +20,7 @@
#include "parser.h"
#include "HtPack.h"
#include "Collection.h"
+#include "Dictionary.h"
#define WORD 1000
#define DONE 1001
@@ -350,24 +351,50 @@
// OK, now we have a previous list in wordList and a new list
List *results = new List;
+ Dictionary newDict(5000);
+
+ String nid;
+ newWords->Start_Get();
+ while ((newWord = (HtWordReference *) newWords->Get_Next()))
+ {
+ nid = "";
+ int did = newWord->DocID();
+ nid << did;
+ nid << "-";
+ int loc = newWord->Location();
+ nid << loc;
+ if (! newDict.Exists(nid)) {
+ newDict.Add(nid, (Object *)newWord);
+ } else {
+// cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n";
+// Double addition is a problem if you don't want your original objects deleted
+ }
+ }
+
+ String oid;
oldWords->Start_Get();
while ((oldWord = (HtWordReference *) oldWords->Get_Next()))
{
- newWords->Start_Get();
- while ((newWord = (HtWordReference *) newWords->Get_Next()))
- {
- if (oldWord->DocID() == newWord->DocID())
- if ((oldWord->Location() + 1) == newWord->Location())
- {
- HtWordReference *result = new HtWordReference(*oldWord);
+ oid = "";
+ int did = oldWord->DocID();
+ oid << did;
+ oid << "-";
+ int loc = oldWord->Location();
+ oid << loc+1;
+ if (newDict.Exists(oid))
+ {
+ newWord = (HtWordReference *)newDict.Find(oid);
+
+ HtWordReference *result = new HtWordReference(*oldWord);
- result->Flags(oldWord->Flags() & newWord->Flags());
- result->Location(newWord->Location());
+ result->Flags(oldWord->Flags() & newWord->Flags());
+ result->Location(newWord->Location());
- results->Add(result);
- }
+ results->Add(result);
}
}
+
+ newDict.Release();
if(debug) cerr << "old words count: " << oldWords->Count() << endl;
if(debug) cerr << "results count: " << results->Count() << endl;
--
Toivo Pedaste Email: [EMAIL PROTECTED]
University Communications Services, Phone: +61 8 9 380 2605
University of Western Australia Fax: +61 8 9 380 1109
"The time has come", the Walrus said, "to talk of many things"...
_______________________________________________
htdig-dev mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/htdig-dev