Andrzej Bialecki wrote:
> Ok, I just tested IndexSorter for now. It appears to work correctly; at
> least I get exactly the same results, with the same scores and the same
> explanations, if I run the same queries on the original and on the
> sorted index.
Here's a more complete version, still mostly untested. This should make
searches faster. We'll see how good the results are...
This includes a patch to Lucene to make it easier to write hit
collectors that collect TopDocs.
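For illustration, here's roughly what a collector looks like against the
new API (an untested sketch; ScoreThresholdCollector is a made-up name,
not part of this patch):

  // Untested sketch using only the new public TopDocCollector API.
  public class ScoreThresholdCollector extends TopDocCollector {
    private final float threshold;

    public ScoreThresholdCollector(int numHits, float threshold) {
      super(numHits);                // inherit the bounded hit queue
      this.threshold = threshold;
    }

    public void collect(int doc, float score) {
      if (score >= threshold)        // ignore low-scoring hits
        super.collect(doc, score);   // queue maintenance is inherited
    }
  }

  // Usage:
  //   TopDocCollector collector = new ScoreThresholdCollector(10, 0.5f);
  //   searcher.search(query, null, collector);
  //   TopDocs results = collector.topDocs();
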
I'll test this on a 38M document index tomorrow.
Cheers,
Doug
Index: src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
===================================================================
--- src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (revision 356540)
+++ src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (working copy)
@@ -22,6 +22,8 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.misc.ChainedFilter;
+import org.apache.nutch.util.NutchConf;
+
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.ArrayList;
@@ -34,6 +36,30 @@
* accelerates query constraints like date, language, document format, etc.,
* which do not affect ranking but might otherwise slow search considerably. */
class LuceneQueryOptimizer {
+
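+ // The maximum number of raw hits to examine per query. With an index
+ // sorted by decreasing page score the best documents are seen first,
+ // so collection can stop early and the total hit count extrapolated.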
+ private static int MAX_HITS =
+ NutchConf.get().getInt("searcher.max.hits", 1000);
+
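+ // Thrown to abort scoring once the hit limit is reached; records the
+ // doc id where collection stopped so the total hit count can be
+ // estimated from the fraction of the index actually searched.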
+ private static class LimitExceeded extends RuntimeException {
+ private int maxDoc;
+ public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }
+ }
+
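+ // A TopDocCollector that throws LimitExceeded once maxHits documents
+ // have been collected.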
+ private static class LimitedCollector extends TopDocCollector {
+ private int maxHits;
+
+ public LimitedCollector(int numHits, int maxHits) {
+ super(numHits);
+ this.maxHits = maxHits;
+ }
+
+ public void collect(int doc, float score) {
+ if (getTotalHits() >= maxHits)
+ throw new LimitExceeded(doc);
+ super.collect(doc, score);
+ }
+ }
+
private LinkedHashMap cache; // an LRU cache of QueryFilter
private float threshold;
@@ -123,9 +149,21 @@
}
}
}
- if (sortField == null && !reverse) {
- return searcher.search(query, filter, numHits);
+ if (sortField == null && !reverse) {
+ LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS);
+ LimitExceeded exceeded = null;
+ try {
+ searcher.search(query, filter, collector);
+ } catch (LimitExceeded le) {
+ exceeded = le;
+ }
+ TopDocs results = collector.topDocs();
+ if (exceeded != null) { // limit was exceeded
+ results.totalHits = (int) // must estimate totalHits
+ (results.totalHits*(searcher.maxDoc()/(float)exceeded.maxDoc));
+ }
+ return results;
} else {
return searcher.search(query, filter, numHits,
new Sort(sortField, reverse));
Index: src/java/org/apache/nutch/searcher/NutchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/NutchBean.java (revision 356540)
+++ src/java/org/apache/nutch/searcher/NutchBean.java (working copy)
@@ -350,13 +350,13 @@
Hits hits = bean.search(query, 10);
System.out.println("Total hits: " + hits.getTotal());
- int length = (int)Math.min(hits.getTotal(), 10);
+ int length = (int)Math.min(hits.getTotal(), Math.min(hits.getLength(), 10));
Hit[] show = hits.getHits(0, length);
HitDetails[] details = bean.getDetails(show);
String[] summaries = bean.getSummary(details, query);
- for (int i = 0; i < hits.getLength(); i++) {
- System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
+ for (int i = 0; i < length; i++) {
+ System.out.println(" "+i+" "+details[i]);
}
}
Index: src/java/org/apache/nutch/indexer/IndexSorter.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexSorter.java (revision 0)
+++ src/java/org/apache/nutch/indexer/IndexSorter.java (revision 0)
@@ -0,0 +1,295 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Arrays;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
+
+import org.apache.nutch.util.NutchConf;
+
+/** Sort a Nutch index by page score. Higher scoring documents are assigned
+ * smaller document numbers. */
+public class IndexSorter {
+
+ private static class PostingMap implements Comparable {
+ private int newDoc;
+ private long offset;
+
+ public int compareTo(Object o) { // order by newDoc id
+ return this.newDoc - ((PostingMap)o).newDoc;
+ }
+ }
+
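+ /** A TermPositions that returns postings in the order of the remapped
+ * document numbers. The postings for each term are buffered in a
+ * RAMDirectory, re-sorted by new doc id, and then replayed. */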
+ private static class SortedTermPositions implements TermPositions {
+ private TermPositions original;
+ private int[] oldToNew;
+
+ private int docFreq;
+
+ private PostingMap[] postingMaps = new PostingMap[0];
+ private int pointer;
+
+ private int freq;
+ private int position;
+
+ private static final String TEMP_FILE = "temp";
+ private final RAMDirectory tempDir = new RAMDirectory();
+ private final RAMOutputStream out =
+ (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
+ private IndexInput in;
+
+ public SortedTermPositions(TermPositions original, int[] oldToNew) {
+ this.original = original;
+ this.oldToNew = oldToNew;
+ }
+
+ public void seek(Term term) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public void seek(TermEnum terms) throws IOException {
+ original.seek(terms);
+
+ docFreq = terms.docFreq();
+ pointer = -1;
+
+ if (docFreq > postingMaps.length) { // grow postingsMap
+ PostingMap[] newMap = new PostingMap[docFreq];
+ System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
+ for (int i = postingMaps.length; i < docFreq; i++) {
+ newMap[i] = new PostingMap();
+ }
+ postingMaps = newMap;
+ }
+
+ out.reset();
+
+ int i = 0;
+ while (original.next()) {
+ PostingMap map = postingMaps[i++];
+ map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
+ map.offset = out.getFilePointer(); // save pointer to buffer
+
+ final int tf = original.freq(); // buffer tf & positions
+ out.writeVInt(tf);
+ int prevPosition = 0;
+ for (int j = tf; j > 0; j--) { // delta encode positions
+ int p = original.nextPosition();
+ out.writeVInt(p - prevPosition);
+ prevPosition = p;
+ }
+ }
+ out.flush();
+ docFreq = i; // allow for deletions
+
+ Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
+
+ // NOTE: this might be substantially faster if RAMInputStream were public
+ // and supported a reset() operation.
+ in = tempDir.openInput(TEMP_FILE);
+ }
+
+ public boolean next() throws IOException {
+ pointer++;
+ if (pointer < docFreq) {
+ in.seek(postingMaps[pointer].offset);
+ freq = in.readVInt();
+ position = 0;
+ return true;
+ }
+ return false;
+ }
+
+ public int doc() { return postingMaps[pointer].newDoc; }
+ public int freq() { return freq; }
+
+ public int nextPosition() throws IOException {
+ int positionIncrement = in.readVInt();
+ position += positionIncrement;
+ return position;
+ }
+
+ public int read(int[] docs, int[] freqs) {
+ throw new UnsupportedOperationException();
+ }
+ public boolean skipTo(int target) {
+ throw new UnsupportedOperationException();
+ }
+
+ public void close() throws IOException {
+ original.close();
+ }
+
+ }
+
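+ /** An IndexReader view of the original index in the permuted document
+ * order, so that IndexWriter.addIndexes() below writes out a copy of
+ * the index sorted by decreasing score. */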
+ private static class SortingReader extends FilterIndexReader {
+
+ private int[] oldToNew;
+ private int[] newToOld;
+
+ public SortingReader(IndexReader oldReader, int[] oldToNew) {
+ super(oldReader);
+ this.oldToNew = oldToNew;
+
+ this.newToOld = new int[oldReader.numDocs()];
+ int oldDoc = 0;
+ while (oldDoc < oldToNew.length) {
+ int newDoc = oldToNew[oldDoc];
+ if (newDoc != -1) {
+ newToOld[newDoc] = oldDoc;
+ }
+ oldDoc++;
+ }
+ }
+
+ public Document document(int n) throws IOException {
+ return super.document(newToOld[n]);
+ }
+
+ public boolean isDeleted(int n) {
+ return false;
+ }
+
+ public byte[] norms(String f) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public void norms(String f, byte[] norms, int offset) throws IOException {
+ byte[] oldNorms = super.norms(f);
+ int oldDoc = 0;
+ while (oldDoc < oldNorms.length) {
+ int newDoc = oldToNew[oldDoc];
+ if (newDoc != -1) {
+ norms[newDoc] = oldNorms[oldDoc];
+ }
+ oldDoc++;
+ }
+ }
+
+ protected void doSetNorm(int d, String f, byte b) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public TermDocs termDocs() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public TermPositions termPositions() throws IOException {
+ return new SortedTermPositions(super.termPositions(), oldToNew);
+ }
+
+ protected void doDelete(int n) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
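+ /** Pairs an old document number with its score; sorts by decreasing
+ * score, breaking ties by old document number. */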
+ private static class DocScore implements Comparable {
+ private int oldDoc;
+ private float score;
+
+ public int compareTo(Object o) { // order by score, oldDoc
+ DocScore that = (DocScore)o;
+ if (this.score == that.score) {
+ return this.oldDoc - that.oldDoc;
+ } else {
+ return this.score < that.score ? 1 : -1 ;
+ }
+ }
+ }
+
+ private File directory;
+
+ public IndexSorter(File directory) {
+ this.directory = directory;
+ }
+
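+ /** Writes a copy of the index, sorted by score, to "index-sorted". */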
+ public void sort() throws IOException {
+ IndexReader reader = IndexReader.open(new File(directory, "index"));
+
+ SortingReader sorter = new SortingReader(reader, oldToNew(reader));
+ IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"),
+ null, true);
+ writer.setTermIndexInterval
+ (NutchConf.get().getInt("indexer.termIndexInterval", 128));
+ writer.setUseCompoundFile(false);
+ writer.addIndexes(new IndexReader[] { sorter });
+ writer.close();
+ }
+
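+ /** Computes the old-to-new document number map: documents are ordered
+ * by decreasing score, and deleted documents map to -1. */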
+ private static int[] oldToNew(IndexReader reader) throws IOException {
+ int readerMax = reader.maxDoc();
+ DocScore[] newToOld = new DocScore[readerMax];
+
+ // use site, an indexed, un-tokenized field to get boost
+ byte[] boosts = reader.norms("site");
+
+ for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
+ float score;
+ if (reader.isDeleted(oldDoc)) {
+ score = 0.0f;
+ } else {
+ score = Similarity.decodeNorm(boosts[oldDoc]);
+ }
+ DocScore docScore = new DocScore();
+ docScore.oldDoc = oldDoc;
+ docScore.score = score;
+ newToOld[oldDoc] = docScore;
+ }
+ Arrays.sort(newToOld);
+
+ int[] oldToNew = new int[readerMax];
+ for (int newDoc = 0; newDoc < readerMax; newDoc++) {
+ DocScore docScore = newToOld[newDoc];
+ oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1;
+ }
+ return oldToNew;
+ }
+
+ /** Sorts the index in the directory given as the command-line argument. */
+ public static void main(String[] args) throws Exception {
+ File directory;
+
+ String usage = "IndexSorter directory";
+
+ if (args.length < 1) {
+ System.err.println("Usage: " + usage);
+ return;
+ }
+
+ directory = new File(args[0]);
+
+ IndexSorter sorter = new IndexSorter(directory);
+
+ Date start = new Date();
+
+ sorter.sort();
+
+ Date end = new Date();
+
+ System.out.print(end.getTime() - start.getTime());
+ System.out.println(" total milliseconds");
+ }
+
+}
Index: src/java/org/apache/lucene/search/TopFieldDocCollector.java
===================================================================
--- src/java/org/apache/lucene/search/TopFieldDocCollector.java (revision 0)
+++ src/java/org/apache/lucene/search/TopFieldDocCollector.java (revision 0)
@@ -0,0 +1,47 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
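+/** A HitCollector that collects the top-sorting hits for a Sort and
+ * returns them as a TopFieldDocs. */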
+public class TopFieldDocCollector extends TopDocCollector {
+
+ public TopFieldDocCollector(IndexReader reader, Sort sort, int numHits)
+ throws IOException {
+ super(numHits, new FieldSortedHitQueue(reader, sort.fields, numHits));
+ }
+
+ public void collect(int doc, float score) {
+ if (score > 0.0f) {
+ totalHits++;
+ hq.insert(new FieldDoc(doc, score));
+ }
+ }
+
+ public TopDocs topDocs() {
+ FieldSortedHitQueue fshq = (FieldSortedHitQueue)hq;
+ ScoreDoc[] scoreDocs = new ScoreDoc[fshq.size()];
+ for (int i = fshq.size()-1; i >= 0; i--) // put docs in array
+ scoreDocs[i] = fshq.fillFields((FieldDoc)fshq.pop()); // & fill sort fields
+ return new TopFieldDocs(totalHits, scoreDocs, fshq.getFields(),
+ fshq.getMaxScore());
+ }
+}
Index: src/java/org/apache/lucene/search/TopDocCollector.java
===================================================================
--- src/java/org/apache/lucene/search/TopDocCollector.java (revision 0)
+++ src/java/org/apache/lucene/search/TopDocCollector.java (revision 0)
@@ -0,0 +1,67 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.PriorityQueue;
+
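+/** A HitCollector that keeps the top-scoring numHits hits in a priority
+ * queue and returns them as a TopDocs. */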
+public class TopDocCollector extends HitCollector {
+ private int numHits;
+ private float minScore = 0.0f;
+
+ int totalHits;
+ PriorityQueue hq;
+
+ public TopDocCollector(int numHits) {
+ this(numHits, new HitQueue(numHits));
+ }
+
+ TopDocCollector(int numHits, PriorityQueue hq) {
+ this.numHits = numHits;
+ this.hq = hq;
+ }
+
+ public void collect(int doc, float score) {
+ if (score > 0.0f) {
+ totalHits++;
+ if (hq.size() < numHits || score >= minScore) {
+ hq.insert(new ScoreDoc(doc, score));
+ minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
+ }
+ }
+ }
+
+ public int getTotalHits() { return totalHits; }
+
+ public TopDocs topDocs() {
+ ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
+ for (int i = hq.size()-1; i >= 0; i--) // put docs in array
+ scoreDocs[i] = (ScoreDoc)hq.pop();
+
+ float maxScore = (totalHits==0)
+ ? Float.NEGATIVE_INFINITY
+ : scoreDocs[0].score;
+
+ return new TopDocs(totalHits, scoreDocs, maxScore);
+ }
+}
Index: src/java/org/apache/lucene/search/IndexSearcher.java
===================================================================
--- src/java/org/apache/lucene/search/IndexSearcher.java (revision 356606)
+++ src/java/org/apache/lucene/search/IndexSearcher.java (working copy)
@@ -95,63 +95,20 @@
if (nDocs <= 0) // null might be returned from hq.top() below.
throw new IllegalArgumentException("nDocs must be > 0");
- Scorer scorer = weight.scorer(reader);
- if (scorer == null)
- return new TopDocs(0, new ScoreDoc[0], Float.NEGATIVE_INFINITY);
-
- final BitSet bits = filter != null ? filter.bits(reader) : null;
- final HitQueue hq = new HitQueue(nDocs);
- final int[] totalHits = new int[1];
- scorer.score(new HitCollector() {
- private float minScore = 0.0f;
- public final void collect(int doc, float score) {
- if (score > 0.0f && // ignore zeroed buckets
- (bits==null || bits.get(doc))) { // skip docs not in bits
- totalHits[0]++;
- if (hq.size() < nDocs || score >= minScore) {
- hq.insert(new ScoreDoc(doc, score));
- minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
- }
- }
- }
- });
-
- ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
- for (int i = hq.size()-1; i >= 0; i--) // put docs in array
- scoreDocs[i] = (ScoreDoc)hq.pop();
-
- float maxScore = (totalHits[0]==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score;
-
- return new TopDocs(totalHits[0], scoreDocs, maxScore);
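+ // hit collection is now factored out into the reusable TopDocCollector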
+ TopDocCollector collector = new TopDocCollector(nDocs);
+ search(weight, filter, collector);
+ return collector.topDocs();
}
// inherit javadoc
public TopFieldDocs search(Weight weight, Filter filter, final int nDocs,
Sort sort)
throws IOException {
- Scorer scorer = weight.scorer(reader);
- if (scorer == null)
- return new TopFieldDocs(0, new ScoreDoc[0], sort.fields, Float.NEGATIVE_INFINITY);
- final BitSet bits = filter != null ? filter.bits(reader) : null;
- final FieldSortedHitQueue hq =
- new FieldSortedHitQueue(reader, sort.fields, nDocs);
- final int[] totalHits = new int[1];
- scorer.score(new HitCollector() {
- public final void collect(int doc, float score) {
- if (score > 0.0f && // ignore zeroed buckets
- (bits==null || bits.get(doc))) { // skip docs not in bits
- totalHits[0]++;
- hq.insert(new FieldDoc(doc, score));
- }
- }
- });
-
- ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
- for (int i = hq.size()-1; i >= 0; i--) // put docs in array
- scoreDocs[i] = hq.fillFields ((FieldDoc) hq.pop());
-
- return new TopFieldDocs(totalHits[0], scoreDocs, hq.getFields(), hq.getMaxScore());
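+ // likewise, sorted collection is factored out into TopFieldDocCollector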
+ TopFieldDocCollector collector =
+ new TopFieldDocCollector(reader, sort, nDocs);
+ search(weight, filter, collector);
+ return (TopFieldDocs)collector.topDocs();
}
// inherit javadoc