cutting 2004/09/22 10:56:01 Modified: src/java/org/apache/lucene/search BooleanScorer.java Scorer.java TermScorer.java Log: Optimize TermQuery within BooleanQuery, a common case. Revision Changes Path 1.12 +66 -26 jakarta-lucene/src/java/org/apache/lucene/search/BooleanScorer.java Index: BooleanScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/BooleanScorer.java,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- BooleanScorer.java 16 Aug 2004 11:41:22 -0000 1.11 +++ BooleanScorer.java 22 Sep 2004 17:56:00 -0000 1.12 @@ -42,7 +42,7 @@ public SubScorer next; public SubScorer(Scorer scorer, boolean required, boolean prohibited, - HitCollector collector, SubScorer next) + HitCollector collector, SubScorer next) throws IOException { this.scorer = scorer; this.done = !scorer.next(); @@ -58,8 +58,8 @@ int mask = 0; if (required || prohibited) { if (nextMask == 0) - throw new IndexOutOfBoundsException - ("More than 32 required/prohibited clauses in query."); + throw new IndexOutOfBoundsException + ("More than 32 required/prohibited clauses in query."); mask = nextMask; nextMask = nextMask << 1; } else @@ -69,12 +69,12 @@ maxCoord++; if (prohibited) - prohibitedMask |= mask; // update prohibited mask + prohibitedMask |= mask; // update prohibited mask else if (required) - requiredMask |= mask; // update required mask + requiredMask |= mask; // update required mask scorers = new SubScorer(scorer, required, prohibited, - bucketTable.newCollector(mask), scorers); + bucketTable.newCollector(mask), scorers); } private final void computeCoordFactors() { @@ -86,6 +86,46 @@ private int end; private Bucket current; + public void score(HitCollector hc) throws IOException { + score(hc, Integer.MAX_VALUE); + } + + protected boolean score(HitCollector hc, int max) throws IOException { + if (coordFactors == null) + computeCoordFactors(); + + boolean more; + do { + while (bucketTable.first != null) { // more queued + current = bucketTable.first; + if (current.doc >= max) + return true; + + // check prohibited & required + if ((current.bits & prohibitedMask) == 0 && + (current.bits & requiredMask) == requiredMask) { + hc.collect(current.doc, current.score * coordFactors[current.coord]); + } + + bucketTable.first = current.next; // pop the queue + } + + // refill the queue + more = false; + end += BucketTable.SIZE; + for (SubScorer sub = scorers; sub != null; sub = sub.next) { + if (!sub.done) { + sub.done = !sub.scorer.score(sub.collector, end); + if (!sub.done) + more = true; + } + } + } while (bucketTable.first != null || more); + + return false; + } + + public int doc() { return current.doc; } public boolean next() throws IOException { @@ -127,20 +167,20 @@ } static final class Bucket { - int doc = -1; // tells if bucket is valid - float score; // incremental score - int bits; // used for bool constraints - int coord; // count of terms in score - Bucket next; // next valid bucket + int doc = -1; // tells if bucket is valid + float score; // incremental score + int bits; // used for bool constraints + int coord; // count of terms in score + Bucket next; // next valid bucket } /** A simple hash table of document scores within a range. */ static final class BucketTable { - public static final int SIZE = 1 << 10; + public static final int SIZE = 1 << 8; public static final int MASK = SIZE - 1; final Bucket[] buckets = new Bucket[SIZE]; - Bucket first = null; // head of valid list + Bucket first = null; // head of valid list private BooleanScorer scorer; @@ -167,20 +207,20 @@ final int i = doc & BucketTable.MASK; Bucket bucket = table.buckets[i]; if (bucket == null) - table.buckets[i] = bucket = new Bucket(); + table.buckets[i] = bucket = new Bucket(); - if (bucket.doc != doc) { // invalid bucket - bucket.doc = doc; // set doc - bucket.score = score; // initialize score - bucket.bits = mask; // initialize mask - bucket.coord = 1; // initialize coord - - bucket.next = table.first; // push onto valid list - table.first = bucket; - } else { // valid bucket - bucket.score += score; // increment score - bucket.bits |= mask; // add bits in mask - bucket.coord++; // increment coord + if (bucket.doc != doc) { // invalid bucket + bucket.doc = doc; // set doc + bucket.score = score; // initialize score + bucket.bits = mask; // initialize mask + bucket.coord = 1; // initialize coord + + bucket.next = table.first; // push onto valid list + table.first = bucket; + } else { // valid bucket + bucket.score += score; // increment score + bucket.bits |= mask; // add bits in mask + bucket.coord++; // increment coord } } } 1.7 +17 -0 jakarta-lucene/src/java/org/apache/lucene/search/Scorer.java Index: Scorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Scorer.java,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- Scorer.java 17 Aug 2004 20:38:45 -0000 1.6 +++ Scorer.java 22 Sep 2004 17:56:00 -0000 1.7 @@ -48,6 +48,23 @@ } } + /** Expert: Collects matching documents in a range. Hook for optimization. + * Note that [EMAIL PROTECTED] #next()} must be called once before this method is called + * for the first time. + * @param hc The collector to which all matching documents are passed through + * [EMAIL PROTECTED] HitCollector#collect(int, float)}. + * @param max Do not score documents past this. + * @return true if more matching documents may remain. + */ + protected boolean score(HitCollector hc, int max) throws IOException { + while (doc() < max) { + hc.collect(doc(), score()); + if (!next()) + return false; + } + return true; + } + /** Advances to the next document matching the query. * @return true iff there is another document matching the query. */ 1.13 +38 -5 jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java Index: TermScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- TermScorer.java 17 Aug 2004 20:38:45 -0000 1.12 +++ TermScorer.java 22 Sep 2004 17:56:00 -0000 1.13 @@ -29,8 +29,8 @@ private float weightValue; private int doc; - private final int[] docs = new int[32]; // buffered doc numbers - private final int[] freqs = new int[32]; // buffered term freqs + private final int[] docs = new int[32]; // buffered doc numbers + private final int[] freqs = new int[32]; // buffered term freqs private int pointer; private int pointerMax; @@ -55,6 +55,39 @@ scoreCache[i] = getSimilarity().tf(i) * weightValue; } + public void score(HitCollector hc) throws IOException { + next(); + score(hc, Integer.MAX_VALUE); + } + + protected boolean score(HitCollector c, int end) throws IOException { + Similarity similarity = getSimilarity(); // cache sim in local + while (doc < end) { // for docs in window + int f = freqs[pointer]; + float score = // compute tf(f)*weight + f < SCORE_CACHE_SIZE // check cache + ? scoreCache[f] // cache hit + : similarity.tf(f)*weightValue; // cache miss + + score *= Similarity.decodeNorm(norms[doc]); // normalize for field + + c.collect(doc, score); // collect score + + if (++pointer >= pointerMax) { + pointerMax = termDocs.read(docs, freqs); // refill buffers + if (pointerMax != 0) { + pointer = 0; + } else { + termDocs.close(); // close stream + doc = Integer.MAX_VALUE; // set to sentinel value + return false; + } + } + doc = docs[pointer]; + } + return true; + } + /** Returns the current document number matching the query. * Initially invalid, until [EMAIL PROTECTED] #next()} is called the first time. */ @@ -72,8 +105,8 @@ if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream - doc = Integer.MAX_VALUE; // set to sentinel value + termDocs.close(); // close stream + doc = Integer.MAX_VALUE; // set to sentinel value return false; } } @@ -84,7 +117,7 @@ public float score() { int f = freqs[pointer]; float raw = // compute tf(f)*weight - f < SCORE_CACHE_SIZE // check cache + f < SCORE_CACHE_SIZE // check cache ? scoreCache[f] // cache hit : getSimilarity().tf(f)*weightValue; // cache miss
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]