I haven't had a chance to do much on this lately (BigMultiTermScorer), so here is some code I had sitting around — unfinished and untested — that may help stimulate discussion on the direction.
-Yonik

package org.apache.lucene.search;

import org.apache.lucene.index.*;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.IOException;
import java.util.BitSet;

import junit.framework.TestCase;

class TestWildcardQuery2 extends TestCase {

  /**
   * Adds {@code num} documents to the writer; the "id" field cycles
   * through the values 1..range-1, 0, 1, ...
   */
  void addDocs(IndexWriter writer, int num, int range) throws IOException {
    int id = 0;
    for (int i = 0; i < num; i++) {
      if (++id >= range) id = 0;
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(id),
                        Field.Store.NO, Field.Index.UN_TOKENIZED));
      // FIX: the original built the Document but never added it to the
      // writer, so the index was always empty.
      writer.addDocument(doc);
    }
  }

  /** Builds a RAMDirectory index containing {@code size} documents. */
  Directory getIndex(int size) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    // FIX: the original hard-coded 1000 and ignored the 'size' parameter.
    addDocs(writer, size, size);
    writer.close();
    return dir;
  }

  public void testScore() {
    // TODO: unfinished (per the accompanying note).
  }
}

class WildcardQuery2 extends Query {
  protected final Term term;

  public WildcardQuery2(Term term) {
    this.term = term;
  }

  // refactor to MultiTermWeight and share it?
  protected class WildcardWeight implements Weight {
    private Searcher searcher;
    private float queryNorm;
    private float queryWeight;

    public WildcardWeight(Searcher searcher) {
      this.searcher = searcher;
    }

    public Query getQuery() { return WildcardQuery2.this; }

    public float getValue() { return queryWeight; }

    public float sumOfSquaredWeights() throws IOException {
      queryWeight = getBoost();
      return queryWeight * queryWeight;
    }

    public void normalize(float norm) {
      this.queryNorm = norm;
      queryWeight *= this.queryNorm;
    }

    public Scorer scorer(IndexReader reader) throws IOException {
      // could analyze the number of terms at this point and only
      // use the Big scorer if the number of terms are high.
      BigMultiTermScorer scorer =
          new BigMultiTermScorer(getSimilarity(searcher), reader);
      scorer.add(new WildcardTermEnum(reader, term),
                 getSimilarity(searcher),
                 this,
                 reader.norms(term.field()),
                 true,   // include_idf
                 true);  // include_tf
      scorer.done();
      return scorer;
    }

    public Explanation explain(IndexReader reader, int doc) throws IOException {
      return new Explanation(1.0f, "WildcardQuery2 dummy explain");
    }
  }

  public String toString(String field) {
    return "WildCardQuery2(" + term + ")";
  }
}

/** BigMultiTermScorer should be used when the number of terms in an expanded query is larger
 * than the maximum number of clauses in a boolean query.
 *
 * @author yonik
 * @version $Id$
 */
class BigMultiTermScorer extends Scorer {
  private final IndexReader reader;
  private final float[] normDecoder;   // maps an encoded norm byte (0..255) to a float
  private final byte[] scores;         // per-doc accumulated score, byte52-encoded
  private final BitSet docs;           // which docs matched at least one term
  private int pos = -1;                // cursor for next()/doc()/score()

  // It may be desirable to share one score[] across multiple clauses
  // of a query to save memory... say in the case of
  //   QUERY = title:foo* OR subject:foo*
  //   QUERY = foo* OR bar*
  // Right now, this can be done by instantiating a single scorer and
  // calling add() multiple times.  An alternate way could be to pass
  // in the score[] in the constructor, and share across multiple Scorer
  // instances.  This might be needed to optimize "foo* AND bar*" since
  // that requires two scorers.
  // Alternate pattern: create a ScoreAccumulator class that could
  // be shared with multiple scorers.  That's pretty much what MatchManyScorer
  // is anyway though.

  public BigMultiTermScorer(Similarity similarity, IndexReader reader)
      throws IOException {
    super(similarity);
    this.reader = reader;
    int maxDoc = reader.maxDoc();
    scores = new byte[maxDoc];
    docs = new BitSet(maxDoc);
    normDecoder = Similarity.getNormDecoder();
  }

  // notes: similarity, weight, norms are passed separately for each add()
  // to enable sharing of this scorer for multiple clauses of a query.
  public void add(TermEnum terms, Similarity similarity, Weight w, byte[] norms,
                  boolean include_idf, boolean include_tf) throws IOException {
    float weightVal = w.getValue();
    int maxDoc = reader.maxDoc();
    TermDocs tdocs = reader.termDocs();
    try {
      while (terms.next()) {
        tdocs.seek(terms);
        float termScore = weightVal;
        if (include_idf) {
          termScore *= similarity.idf(terms.docFreq(), maxDoc);
        }
        add(tdocs, similarity, termScore, norms, include_tf);
      }
    } finally {
      // FIX: the locally created TermDocs was never closed in the original.
      tdocs.close();
    }
  }

  /**
   * Accumulates scores for every doc in {@code tdocs}.
   *
   * @param tdocs      positioned term docs; consumed to exhaustion
   * @param similarity used for the tf() component
   * @param termScore  all components of the score that are not document
   *                   specific (weight, idf are not document specific;
   *                   tf, norm are)
   * @param norms      encoded field norms, or null to skip norm scaling
   * @param include_tf whether to multiply in the term-frequency component
   * @throws IOException on index read error
   */
  public void add(TermDocs tdocs, Similarity similarity, float termScore,
                  byte[] norms, boolean include_tf) throws IOException {
    while (tdocs.next()) {
      int doc = tdocs.doc();
      float subscore = termScore;
      if (include_tf) subscore *= similarity.tf(tdocs.freq());
      // FIX: the 0xff mask must be applied to the (signed) norm byte, not
      // to the doc id.  The original read normDecoder[norms[doc & 0xff]],
      // which indexes with a possibly negative byte (ArrayIndexOutOfBounds)
      // and reads the wrong doc's norm for doc >= 256.
      if (norms != null) subscore *= normDecoder[norms[doc] & 0xff];
      add(doc, subscore);
    }
  }

  /** Folds {@code score} into the running byte52-encoded total for {@code doc}. */
  public void add(int doc, float score) {
    float curr = SmallFloat.byte52ToFloat(scores[doc]);
    scores[doc] = SmallFloat.floatToByte52(curr + score);
    docs.set(doc);
  }

  /** done should be called after all calls to add() and before the
   * first call to next().
   */
  public void done() {
    // done() isn't really needed in the current implementation, but
    // it may be needed in an alternate implementation.
    pos = -1;
  }

  public boolean next() throws IOException {
    pos = docs.nextSetBit(pos + 1);
    return pos >= 0;
  }

  public int doc() { return pos; }

  public float score() throws IOException {
    return SmallFloat.byte52ToFloat(scores[pos]);
  }

  public boolean skipTo(int target) throws IOException {
    // Rewind just before the target so next() lands on the first set bit
    // at or after it.
    pos = target - 1;
    return next();
  }

  public Explanation explain(int doc) throws IOException {
    return null;  // not implemented (see WildcardWeight.explain dummy)
  }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]