Andrzej Bialecki wrote:
> Ok, I just tested IndexSorter for now. It appears to work correctly; at
> least I get exactly the same results, with the same scores and the same
> explanations, if I run the same queries on the original and on the
> sorted index.
Here's a more complete version, still mostly untested. This should make
searches faster. We'll see how good the results are...
This includes a patch to Lucene to make it easier to write hit
collectors that collect TopDocs.
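For illustration, here's roughly what a collector looks like against the
new API (an untested sketch; ScoreThresholdCollector is a made-up name,
not part of this patch):

  // Untested sketch using only the new public TopDocCollector API.
  public class ScoreThresholdCollector extends TopDocCollector {
    private final float threshold;

    public ScoreThresholdCollector(int numHits, float threshold) {
      super(numHits);                // inherit the bounded hit queue
      this.threshold = threshold;
    }

    public void collect(int doc, float score) {
      if (score >= threshold)        // ignore low-scoring hits
        super.collect(doc, score);   // queue maintenance is inherited
    }
  }

  // Usage:
  //   TopDocCollector collector = new ScoreThresholdCollector(10, 0.5f);
  //   searcher.search(query, null, collector);
  //   TopDocs results = collector.topDocs();
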
I'll test this on a 38M document index tomorrow.
Cheers,
Doug
Index: src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
===================================================================
--- src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (revision 356540)
+++ src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (working copy)
@@ -22,6 +22,8 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.misc.ChainedFilter;
+import org.apache.nutch.util.NutchConf;
+
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.ArrayList;
@@ -34,6 +36,30 @@
* accelerates query constraints like date, language, document format, etc.,
* which do not affect ranking but might otherwise slow search considerably. */
class LuceneQueryOptimizer {
+
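+ // The maximum number of raw hits to examine per query. With an index
+ // sorted by decreasing page score the best documents are seen first,
+ // so collection can stop early and the total hit count extrapolated.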
+ private static int MAX_HITS =
+ NutchConf.get().getInt("searcher.max.hits", 1000);
+
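+ // Thrown to abort scoring once the hit limit is reached; records the
+ // doc id where collection stopped so the total hit count can be
+ // estimated from the fraction of the index actually searched.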
+ private static class LimitExceeded extends RuntimeException {
+ private int maxDoc;
+ public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }
+ }
+
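+ // A TopDocCollector that throws LimitExceeded once maxHits documents
+ // have been collected.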
+ private static class LimitedCollector extends TopDocCollector {
+ private int maxHits;
+
+ public LimitedCollector(int numHits, int maxHits) {
+ super(numHits);
+ this.maxHits = maxHits;
+ }
+
+ public void collect(int doc, float score) {
+ if (getTotalHits() >= maxHits)
+ throw new LimitExceeded(doc);
+ super.collect(doc, score);
+ }
+ }
+
private LinkedHashMap cache; // an LRU cache of QueryFilter
private float threshold;
@@ -123,9 +149,21 @@
}
}
}
- if (sortField == null && !reverse) {
- return searcher.search(query, filter, numHits);
+ if (sortField == null && !reverse) {
+ LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS);
+ LimitExceeded exceeded = null;
+ try {
+ searcher.search(query, filter, collector);
+ } catch (LimitExceeded le) {
+ exceeded = le;
+ }
+ TopDocs results = collector.topDocs();
+ if (exceeded != null) { // limit was exceeded
+ results.totalHits = (int) // must estimate totalHits
+ (results.totalHits*(searcher.maxDoc()/(float)exceeded.maxDoc));
+ }
+ return results;
} else {
return searcher.search(query, filter, numHits,
new Sort(sortField, reverse));
Index: src/java/org/apache/nutch/searcher/NutchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/NutchBean.java (revision 356540)
+++ src/java/org/apache/nutch/searcher/NutchBean.java (working copy)
@@ -350,13 +350,13 @@
Hits hits = bean.search(query, 10);
System.out.println("Total hits: " + hits.getTotal());
- int length = (int)Math.min(hits.getTotal(), 10);
+ int length = (int)Math.min(hits.getTotal(), Math.min(hits.getLength(), 10));
Hit[] show = hits.getHits(0, length);
HitDetails[] details = bean.getDetails(show);
String[] summaries = bean.getSummary(details, query);
- for (int i = 0; i < hits.getLength(); i++) {
- System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
+ for (int i = 0; i < length; i++) {
+ System.out.println(" "+i+" "+details[i]);
}
}
Index: src/java/org/apache/nutch/indexer/IndexSorter.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexSorter.java (revision 0)
+++ src/java/org/apache/nutch/indexer/IndexSorter.java (revision 0)
@@ -0,0 +1,295 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Arrays;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
+
+import org.apache.nutch.util.NutchConf;
+
+/** Sort a Nutch index by page score. Higher scoring documents are assigned
+ * smaller document numbers. */
+public class IndexSorter {
+
+ private static class PostingMap implements Comparable {
+ private int newDoc;
+ private long offset;
+
+ public int compareTo(Object o) { // order by newDoc id
+ return this.newDoc - ((PostingMap)o).newDoc;
+ }
+ }
+
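+ /** A TermPositions that returns postings in the order of the remapped
+ * document numbers. The postings for each term are buffered in a
+ * RAMDirectory, re-sorted by new doc id, and then replayed. */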
+ private static class SortedTermPositions implements TermPositions {
+ private TermPositions original;
+ private int[] oldToNew;
+
+ private int docFreq;
+
+ private PostingMap[] postingMaps = new PostingMap[0];
+ private int pointer;
+
+ private int freq;
+ private int position;
+
+ private static final String TEMP_FILE = "temp";
+ private final RAMDirectory tempDir = new RAMDirectory();
+ private final RAMOutputStream out =
+ (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
+ private IndexInput in;
+
+ public SortedTermPositions(TermPositions original, int[] oldToNew) {
+ this.original = original;
+ this.oldToNew = oldToNew;
+ }
+
+ public void seek(Term term) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public void seek(TermEnum terms) throws IOException {
+ original.seek(terms);
+
+ docFreq = terms.docFreq();
+ pointer = -1;
+
+ if (docFreq > postingMaps.length) { // grow postingsMap
+ PostingMap[] newMap = new PostingMap[docFreq];
+ System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
+ for (int i = postingMaps.length; i < docFreq; i++) {
+ newMap[i] = new PostingMap();
+ }
+ postingMaps = newMap;
+ }
+
+ out.reset();
+
+ int i = 0;
+ while (original.next()) {
+ PostingMap map = postingMaps[i++];
+ map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
+ map.offset = out.getFilePointer(); // save pointer to buffer
+
+ final int tf = original.freq(); // buffer tf & positions
+ out.writeVInt(tf);
+ int prevPosition = 0;
+ for (int j = tf; j > 0; j--) { // delta encode positions
+ int p = original.nextPosition();
+ out.writeVInt(p - prevPosition);
+ prevPosition = p;
+ }
+ }
+ out.flush();
+ docFreq = i; // allow for deletions
+
+ Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
+
+ // NOTE: this might be substantially faster if RAMInputStream were public
+ // and supported a reset() operation.
+ in = tempDir.openInput(TEMP_FILE);
+ }
+
+ public boolean next() throws IOException {
+ pointer++;
+ if (pointer < docFreq) {
+ in.seek(postingMaps[pointer].offset);
+ freq = in.readVInt();
+ position = 0;
+ return true;
+ }
+ return false;
+ }
+
+ public int doc() { return postingMaps[pointer].newDoc; }
+ public int freq() { return freq; }
+
+ public int nextPosition() throws IOException {
+ int positionIncrement = in.readVInt();
+ position += positionIncrement;
+ return position;
+ }
+
+ public int read(int[] docs, int[] freqs) {
+ throw new UnsupportedOperationException();
+ }
+ public boolean skipTo(int target) {
+ throw new UnsupportedOperationException();
+ }
+
+ public void close() throws IOException {
+ original.close();
+ }
+
+ }
+
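+ /** An IndexReader view of the original index in the permuted document
+ * order, so that IndexWriter.addIndexes() below writes out a copy of
+ * the index sorted by decreasing score. */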
+ private static class SortingReader extends FilterIndexReader {
+
+ private int[] oldToNew;
+ private int[] newToOld;
+
+ public SortingReader(IndexReader oldReader, int[] oldToNew) {
+ super(oldReader);
+ this.oldToNew = oldToNew;
+
+ this.newToOld = new int[oldReader.numDocs()];
+ int oldDoc = 0;
+ while (oldDoc < oldToNew.length) {
+ int newDoc = oldToNew[oldDoc];
+ if (newDoc != -1) {
+ newToOld[newDoc] = oldDoc;
+ }
+ oldDoc++;
+ }
+ }
+
+ public Document document(int n) throws IOException {
+ return super.document(newToOld[n]);
+ }
+
+ public boolean isDeleted(int n) {
+ return false;
+ }
+
+ public byte[] norms(String f) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public void norms(String f, byte[] norms, int offset) throws IOException {
+ byte[] oldNorms = super.norms(f);
+ int oldDoc = 0;
+ while (oldDoc < oldNorms.length) {
+ int newDoc = oldToNew[oldDoc];
+ if (newDoc != -1) {
+ norms[newDoc] = oldNorms[oldDoc];
+ }
+ oldDoc++;
+ }
+ }
+
+ protected void doSetNorm(int d, String f, byte b) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public TermDocs termDocs() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public TermPositions termPositions() throws IOException {
+ return new SortedTermPositions(super.termPositions(), oldToNew);
+ }
+
+ protected void doDelete(int n) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
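+ /** Pairs an old document number with its score; sorts by decreasing
+ * score, breaking ties by old document number. */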
+ private static class DocScore implements Comparable {
+ private int oldDoc;
+ private float score;
+
+ public int compareTo(Object o) { // order by score, oldDoc
+ DocScore that = (DocScore)o;
+ if (this.score == that.score) {
+ return this.oldDoc - that.oldDoc;
+ } else {
+ return this.score < that.score ? 1 : -1 ;
+ }
+ }
+ }
+
+ private File directory;
+
+ public IndexSorter(File directory) {
+ this.directory = directory;
+ }
+
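+ /** Writes a copy of the index, sorted by score, to "index-sorted". */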
+ public void sort() throws IOException {
+ IndexReader reader = IndexReader.open(new File(directory, "index"));
+
+ SortingReader sorter = new SortingReader(reader, oldToNew(reader));
+ IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"),
+ null, true);
+ writer.setTermIndexInterval
+ (NutchConf.get().getInt("indexer.termIndexInterval", 128));
+ writer.setUseCompoundFile(false);
+ writer.addIndexes(new IndexReader[] { sorter });
+ writer.close();
+ }
+
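+ /** Computes the old-to-new document number map: documents are ordered
+ * by decreasing score, and deleted documents map to -1. */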
+ private static int[] oldToNew(IndexReader reader) throws IOException {
+ int readerMax = reader.maxDoc();
+ DocScore[] newToOld = new DocScore[readerMax];
+
+ // use site, an indexed, un-tokenized field to get boost
+ byte[] boosts = reader.norms("site");
+
+ for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
+ float score;
+ if (reader.isDeleted(oldDoc)) {
+ score = 0.0f;
+ } else {
+ score = Similarity.decodeNorm(boosts[oldDoc]);
+ }
+ DocScore docScore = new DocScore();
+ docScore.oldDoc = oldDoc;
+ docScore.score = score;
+ newToOld[oldDoc] = docScore;
+ }
+ Arrays.sort(newToOld);
+
+ int[] oldToNew = new int[readerMax];
+ for (int newDoc = 0; newDoc < readerMax; newDoc++) {
+ DocScore docScore = newToOld[newDoc];
+ oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1;
+ }
+ return oldToNew;
+ }
+
+ /** Sorts the index in the directory given as the command-line argument. */
+ public static void main(String[] args) throws Exception {
+ File directory;
+
+ String usage = "IndexSorter directory";
+
+ if (args.length < 1) {
+ System.err.println("Usage: " + usage);
+ return;
+ }
+
+ directory = new File(args[0]);
+
+ IndexSorter sorter = new IndexSorter(directory);
+
+ Date start = new Date();
+
+ sorter.sort();
+
+ Date end = new Date();
+
+ System.out.print(end.getTime() - start.getTime());
+ System.out.println(" total milliseconds");
+ }
+
+}
Index: src/java/org/apache/lucene/search/TopFieldDocCollector.java
===================================================================
--- src/java/org/apache/lucene/search/TopFieldDocCollector.java (revision 0)
+++ src/java/org/apache/lucene/search/TopFieldDocCollector.java (revision 0)
@@ -0,0 +1,47 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
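+/** A HitCollector that collects the top-sorting hits for a Sort and
+ * returns them as a TopFieldDocs. */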
+public class TopFieldDocCollector extends TopDocCollector {
+
+ public TopFieldDocCollector(IndexReader reader, Sort sort, int numHits)
+ throws IOException {
+ super(numHits, new FieldSortedHitQueue(reader, sort.fields, numHits));
+ }
+
+ public void collect(int doc, float score) {
+ if (score > 0.0f) {
+ totalHits++;
+ hq.insert(new FieldDoc(doc, score));
+ }
+ }
+
+ public TopDocs topDocs() {
+ FieldSortedHitQueue fshq = (FieldSortedHitQueue)hq;
+ ScoreDoc[] scoreDocs = new ScoreDoc[fshq.size()];
+ for (int i = fshq.size()-1; i >= 0; i--) // put docs in array
+ scoreDocs[i] = fshq.fillFields((FieldDoc)fshq.pop()); // & fill sort fields
+ return new TopFieldDocs(totalHits, scoreDocs, fshq.getFields(),
+ fshq.getMaxScore());
+ }
+}
Index: src/java/org/apache/lucene/search/TopDocCollector.java
===================================================================
--- src/java/org/apache/lucene/search/TopDocCollector.java (revision 0)
+++ src/java/org/apache/lucene/search/TopDocCollector.java (revision 0)
@@ -0,0 +1,67 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.PriorityQueue;
+
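+/** A HitCollector that keeps the top-scoring numHits hits in a priority
+ * queue and returns them as a TopDocs. */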
+public class TopDocCollector extends HitCollector {
+ private int numHits;
+ private float minScore = 0.0f;
+
+ int totalHits;
+ PriorityQueue hq;
+
+ public TopDocCollector(int numHits) {
+ this(numHits, new HitQueue(numHits));
+ }
+
+ TopDocCollector(int numHits, PriorityQueue hq) {
+ this.numHits = numHits;
+ this.hq = hq;
+ }
+
+ public void collect(int doc, float score) {
+ if (score > 0.0f) {
+ totalHits++;
+ if (hq.size() < numHits || score >= minScore) {
+ hq.insert(new ScoreDoc(doc, score));
+ minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
+ }
+ }
+ }
+
+ public int getTotalHits() { return totalHits; }
+
+ public TopDocs topDocs() {
+ ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
+ for (int i = hq.size()-1; i >= 0; i--) // put docs in array
+ scoreDocs[i] = (ScoreDoc)hq.pop();
+
+ float maxScore = (totalHits==0)
+ ? Float.NEGATIVE_INFINITY
+ : scoreDocs[0].score;
+
+ return new TopDocs(totalHits, scoreDocs, maxScore);
+ }
+}
Index: src/java/org/apache/lucene/search/IndexSearcher.java
===================================================================
--- src/java/org/apache/lucene/search/IndexSearcher.java (revision 356606)
+++ src/java/org/apache/lucene/search/IndexSearcher.java (working copy)
@@ -95,63 +95,20 @@
if (nDocs <= 0) // null might be returned from hq.top() below.
throw new IllegalArgumentException("nDocs must be > 0");
- Scorer scorer = weight.scorer(reader);
- if (scorer == null)
- return new TopDocs(0, new ScoreDoc[0], Float.NEGATIVE_INFINITY);
-
- final BitSet bits = filter != null ? filter.bits(reader) : null;
- final HitQueue hq = new HitQueue(nDocs);
- final int[] totalHits = new int[1];
- scorer.score(new HitCollector() {
- private float minScore = 0.0f;
- public final void collect(int doc, float score) {
- if (score > 0.0f && // ignore zeroed buckets
- (bits==null || bits.get(doc))) { // skip docs not in bits
- totalHits[0]++;
- if (hq.size() < nDocs || score >= minScore) {
- hq.insert(new ScoreDoc(doc, score));
- minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
- }
- }
- }
- });
-
- ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
- for (int i = hq.size()-1; i >= 0; i--) // put docs in array
- scoreDocs[i] = (ScoreDoc)hq.pop();
-
- float maxScore = (totalHits[0]==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score;
-
- return new TopDocs(totalHits[0], scoreDocs, maxScore);
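+ // hit collection is now factored out into the reusable TopDocCollector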
+ TopDocCollector collector = new TopDocCollector(nDocs);
+ search(weight, filter, collector);
+ return collector.topDocs();
}
// inherit javadoc
public TopFieldDocs search(Weight weight, Filter filter, final int nDocs,
Sort sort)
throws IOException {
- Scorer scorer = weight.scorer(reader);
- if (scorer == null)
- return new TopFieldDocs(0, new ScoreDoc[0], sort.fields, Float.NEGATIVE_INFINITY);
- final BitSet bits = filter != null ? filter.bits(reader) : null;
- final FieldSortedHitQueue hq =
- new FieldSortedHitQueue(reader, sort.fields, nDocs);
- final int[] totalHits = new int[1];
- scorer.score(new HitCollector() {
- public final void collect(int doc, float score) {
- if (score > 0.0f && // ignore zeroed buckets
- (bits==null || bits.get(doc))) { // skip docs not in bits
- totalHits[0]++;
- hq.insert(new FieldDoc(doc, score));
- }
- }
- });
-
- ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
- for (int i = hq.size()-1; i >= 0; i--) // put docs in array
- scoreDocs[i] = hq.fillFields ((FieldDoc) hq.pop());
-
- return new TopFieldDocs(totalHits[0], scoreDocs, hq.getFields(), hq.getMaxScore());
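+ // likewise, sorted collection is factored out into TopFieldDocCollector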
+ TopFieldDocCollector collector =
+ new TopFieldDocCollector(reader, sort, nDocs);
+ search(weight, filter, collector);
+ return (TopFieldDocs)collector.topDocs();
}
// inherit javadoc