On Sun, Mar 24, 2013 at 10:46 AM, Wei Wang <welshw...@gmail.com> wrote:
Hi, > For example, assume we have fields F1 and F2, we would like to find > all documents with condition F1+F2 > 5.0. This filter may be combined > with other filters to form a BooleanFilter. > > The question is, is there any way to construct an efficient filter to do this? I don't know - but the API looked interesting, so I gave it a try (see below). I had never worked with search filters before writing that code, so please proceed with caution, as I am not sure of many things (iteration of all documents, treatment of deleted documents, what is that "acceptDocs" variable, what threading constraints to respect...). --- // add your package declaration import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.TreeSet; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.Ints; import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.Version; import org.junit.Before; import org.junit.Test; public 
class FilterTest {

    /*
     * Demonstrates a custom Lucene Filter that keeps only the documents for
     * which the sum of two integer fields (FIELD_ALPHA + FIELD_OMEGA) is
     * strictly greater than a threshold. setUp() builds a five-document
     * in-memory index; testSumFilter() runs a match-all query through the
     * filter and checks which documents come back.
     */

    private static final Version VERSION = Version.LUCENE_42;

    private static final String FIELD_ID = "id";
    private static final String FIELD_ALPHA = "alpha";
    private static final String FIELD_OMEGA = "omega";

    private static final int SUM_THRESHOLD = 5;

    // Document ii gets alpha = VALUES_ALPHA[ii] and omega = VALUES_OMEGA[ii];
    // the sums are 6, 2, 8, 4, 10, so documents 0, 2 and 4 exceed the threshold.
    private static final int[] VALUES_ALPHA = new int[] { 1, 2, 3, 4, 5 };
    private static final int[] VALUES_OMEGA = new int[] { 5, 0, 5, 0, 5 };
    private static final Set<Integer> EXPECTED_MATCHED_DOCUMENT_IDS =
            new HashSet<Integer>(Arrays.asList(0, 2, 4));

    private Directory directory;

    /**
     * Creates a fresh in-memory index holding one document per entry of
     * {@link #VALUES_ALPHA} / {@link #VALUES_OMEGA}.
     *
     * @throws IOException if the index cannot be written
     */
    @Before
    public void setUp() throws IOException {
        directory = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer(VERSION);
        IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter writer = new IndexWriter(directory, config);
        try {
            for (int ii = 0; ii < VALUES_ALPHA.length; ii++) {
                Document doc = new Document();
                doc.add(new IntField(FIELD_ID, ii, IntField.Store.YES));
                doc.add(new IntField(FIELD_ALPHA, VALUES_ALPHA[ii], IntField.Store.YES));
                doc.add(new IntField(FIELD_OMEGA, VALUES_OMEGA[ii], IntField.Store.YES));
                writer.addDocument(doc);
            }
        } finally {
            // Close the writer even when adding a document fails.
            writer.close();
        }
    }

    /**
     * Runs a match-all query restricted by {@link SumFilter} and verifies that
     * exactly the expected documents are returned.
     *
     * @throws IOException if the index cannot be read
     */
    @Test
    public void testSumFilter() throws IOException {
        IndexReader reader = DirectoryReader.open(directory);
        try {
            // The search itself is inside the try so the reader is closed even
            // when the search throws.
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs results = searcher.search(
                    new MatchAllDocsQuery(), new SumFilter(SUM_THRESHOLD), VALUES_ALPHA.length);

            assertEquals(EXPECTED_MATCHED_DOCUMENT_IDS.size(), results.totalHits);
            for (int ii = 0; ii < results.scoreDocs.length; ii++) {
                int docId = results.scoreDocs[ii].doc;
                Document doc = reader.document(docId);
                int idValue = doc.getField(FIELD_ID).numericValue().intValue();
                int alphaValue = doc.getField(FIELD_ALPHA).numericValue().intValue();
                int omegaValue = doc.getField(FIELD_OMEGA).numericValue().intValue();
                assertTrue(EXPECTED_MATCHED_DOCUMENT_IDS.contains(idValue));
                assertTrue(alphaValue + omegaValue > SUM_THRESHOLD);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Filter accepting the documents whose FIELD_ALPHA + FIELD_OMEGA value is
     * strictly greater than {@code minValue}.
     */
    private static final class SumFilter extends Filter {

        private final int minValue;

        /**
         * @param minValue exclusive lower bound for the sum of the two fields
         */
        public SumFilter(int minValue) {
            this.minValue = minValue;
        }

        /**
         * Scans every document of the given segment and collects those whose
         * field sum exceeds the threshold.
         *
         * @param context    the segment to filter
         * @param acceptDocs documents that may be matched, or {@code null}
         *                   when every document is allowed
         * @return the matching doc ids of this segment
         * @throws IOException if the field values cannot be loaded
         */
        @Override
        public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
                throws IOException {
            AtomicReader reader = context.reader();
            Ints alphaCache = FieldCache.DEFAULT.getInts(reader, FIELD_ALPHA, false);
            Ints omegaCache = FieldCache.DEFAULT.getInts(reader, FIELD_OMEGA, false);

            SimpleDocIdSet docIdSet = new SimpleDocIdSet();
            int maxDoc = reader.maxDoc();
            for (int docId = 0; docId < maxDoc; docId++) {
                // Skip documents the caller excluded (e.g. deleted ones).
                if (acceptDocs != null && !acceptDocs.get(docId)) {
                    continue;
                }
                // Add as long so two large ints cannot overflow before the comparison.
                long sum = (long) alphaCache.get(docId) + omegaCache.get(docId);
                if (sum > minValue) {
                    docIdSet.add(docId);
                }
            }
            return docIdSet;
        }
    }

    /**
     * Minimal DocIdSet backed by a sorted set, so its iterator yields doc ids
     * in increasing order.
     */
    private static final class SimpleDocIdSet extends DocIdSet {

        private final TreeSet<Integer> sortedDocIdSet = new TreeSet<Integer>();

        /**
         * @param docId doc id to include in the set
         */
        public void add(int docId) {
            sortedDocIdSet.add(docId);
        }

        /**
         * @return a new iterator over the collected doc ids, in increasing order
         */
        @Override
        public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {

                private final Iterator<Integer> docIds = sortedDocIdSet.iterator();

                // -1 before the first nextDoc()/advance(), NO_MORE_DOCS once
                // exhausted, otherwise the doc id the iterator is positioned on.
                private int currentDocId = -1;

                @Override
                public int docID() {
                    return currentDocId;
                }

                @Override
                public int nextDoc() throws IOException {
                    if (docIds.hasNext()) {
                        currentDocId = docIds.next();
                    } else {
                        currentDocId = NO_MORE_DOCS;
                    }
                    return currentDocId;
                }

                @Override
                public int advance(int target) throws IOException {
                    // Always moves at least one step; NO_MORE_DOCS is
                    // Integer.MAX_VALUE, so the loop ends on exhaustion.
                    int docId;
                    do {
                        docId = nextDoc();
                    } while (docId < target);
                    return docId;
                }
            };
        }
    }
}
--- Regards, Yep. --------------------------------------------------------------------- To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org For additional commands, e-mail: java-user-h...@lucene.apache.org