Whoops, thanks Rob! Mike McCandless
http://blog.mikemccandless.com On Thu, Apr 2, 2015 at 11:17 AM, Robert Muir <[email protected]> wrote: > NOTE: i hit compile failure like this (TermRangeTermsEnum got removed). > > I am going to remove these asserts: to me they don't look very useful, > and fix the build for now. > > compile-test: > [mkdir] Created dir: > /home/rmuir/workspace/trunk/lucene/build/core/classes/test > [javac] Compiling 431 source files to > /home/rmuir/workspace/trunk/lucene/build/core/classes/test > [javac] > /home/rmuir/workspace/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java:123: > error: cannot find symbol > [javac] assertFalse(query.getTermsEnum(terms) instanceof > TermRangeTermsEnum); > [javac] ^ > [javac] symbol: class TermRangeTermsEnum > [javac] location: class TestTermRangeQuery > [javac] > /home/rmuir/workspace/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java:128: > error: cannot find symbol > [javac] assertFalse(query.getTermsEnum(terms) instanceof > TermRangeTermsEnum); > [javac] ^ > [javac] symbol: class TermRangeTermsEnum > [javac] location: class TestTermRangeQuery > [javac] > /home/rmuir/workspace/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java:132: > error: cannot find symbol > [javac] assertFalse(query.getTermsEnum(terms) instanceof > TermRangeTermsEnum); > [javac] ^ > [javac] symbol: class TermRangeTermsEnum > [javac] location: class TestTermRangeQuery > [javac] Note: Some input files use or override a deprecated API. > [javac] Note: Recompile with -Xlint:deprecation for details. 
> [javac] 3 errors > > On Thu, Apr 2, 2015 at 11:05 AM, <[email protected]> wrote: >> Author: mikemccand >> Date: Thu Apr 2 15:05:48 2015 >> New Revision: 1670918 >> >> URL: http://svn.apache.org/r1670918 >> Log: >> LUCENE-5879: add auto-prefix terms to block tree, and experimental >> AutoPrefixTermsPostingsFormat >> >> Added: >> >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/ >> >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java >> (with props) >> >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java >> (with props) >> >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/ >> >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java >> (with props) >> >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java >> (with props) >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java >> (with props) >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java >> (with props) >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java >> (with props) >> >> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java >> (with props) >> Removed: >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TermRangeTermsEnum.java >> Modified: >> lucene/dev/trunk/lucene/CHANGES.txt >> >> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java >> >> 
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/Terms.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java >> >> 
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java >> >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java >> >> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java >> >> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java >> >> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java >> >> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java >> >> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java >> >> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java >> >> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java >> >> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java >> >> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java >> >> Modified: lucene/dev/trunk/lucene/CHANGES.txt >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1670918&r1=1670917&r2=1670918&view=diff >> ============================================================================== >> --- lucene/dev/trunk/lucene/CHANGES.txt (original) >> +++ lucene/dev/trunk/lucene/CHANGES.txt Thu Apr 2 15:05:48 2015 >> @@ -19,6 +19,10 @@ New Features >> for counting ranges that align with the underlying terms as defined by the >> NumberRangePrefixTree (e.g. familiar date units like days). 
(David >> Smiley) >> >> +* LUCENE-5879: Added experimental auto-prefix terms to BlockTree terms >> + dictionary, exposed as AutoPrefixPostingsFormat (Adrien Grand, >> + Uwe Schindler, Robert Muir, Mike McCandless) >> + >> API Changes >> >> * LUCENE-3312: The API of oal.document was restructured to >> >> Added: >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java?rev=1670918&view=auto >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java >> (added) >> +++ >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,125 @@ >> +package org.apache.lucene.codecs.autoprefix; >> + >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. 
>> + */ >> + >> +import java.io.IOException; >> + >> +import org.apache.lucene.codecs.FieldsConsumer; >> +import org.apache.lucene.codecs.FieldsProducer; >> +import org.apache.lucene.codecs.PostingsFormat; >> +import org.apache.lucene.codecs.PostingsReaderBase; >> +import org.apache.lucene.codecs.PostingsWriterBase; >> +import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; >> +import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; >> +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; >> +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; >> +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; >> +import org.apache.lucene.index.SegmentReadState; >> +import org.apache.lucene.index.SegmentWriteState; >> +import org.apache.lucene.util.IOUtils; >> + >> +/** >> + * Just like {@link Lucene50PostingsFormat} except this format >> + * exposes the experimental auto-prefix terms. >> + * >> + * @lucene.experimental >> + */ >> + >> +public final class AutoPrefixPostingsFormat extends PostingsFormat { >> + >> + private final int minItemsInBlock; >> + private final int maxItemsInBlock; >> + private final int minItemsInAutoPrefix; >> + private final int maxItemsInAutoPrefix; >> + >> + /** Creates {@code AutoPrefixPostingsFormat} with default settings. */ >> + public AutoPrefixPostingsFormat() { >> + this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, >> + BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, >> + 25, 48); >> + } >> + >> + /** Creates {@code Lucene50PostingsFormat} with custom >> + * values for {@code minBlockSize} and {@code >> + * maxBlockSize} passed to block terms dictionary. 
>> + * @see >> BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) >> */ >> + public AutoPrefixPostingsFormat(int minItemsInAutoPrefix, int >> maxItemsInAutoPrefix) { >> + this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, >> + BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, >> + minItemsInAutoPrefix, >> + maxItemsInAutoPrefix); >> + } >> + >> + /** Creates {@code Lucene50PostingsFormat} with custom >> + * values for {@code minBlockSize}, {@code >> + * maxBlockSize}, {@code minItemsInAutoPrefix} and {@code >> maxItemsInAutoPrefix}, passed >> + * to block tree terms dictionary. >> + * @see >> BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) >> */ >> + public AutoPrefixPostingsFormat(int minItemsInBlock, int maxItemsInBlock, >> int minItemsInAutoPrefix, int maxItemsInAutoPrefix) { >> + super("AutoPrefix"); >> + BlockTreeTermsWriter.validateSettings(minItemsInBlock, >> + maxItemsInBlock); >> + BlockTreeTermsWriter.validateAutoPrefixSettings(minItemsInAutoPrefix, >> + maxItemsInAutoPrefix); >> + this.minItemsInBlock = minItemsInBlock; >> + this.maxItemsInBlock = maxItemsInBlock; >> + this.minItemsInAutoPrefix = minItemsInAutoPrefix; >> + this.maxItemsInAutoPrefix = maxItemsInAutoPrefix; >> + } >> + >> + @Override >> + public String toString() { >> + return getName(); >> + } >> + >> + @Override >> + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws >> IOException { >> + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); >> + >> + boolean success = false; >> + try { >> + FieldsConsumer ret = new BlockTreeTermsWriter(state, >> + postingsWriter, >> + minItemsInBlock, >> + maxItemsInBlock, >> + minItemsInAutoPrefix, >> + maxItemsInAutoPrefix); >> + success = true; >> + return ret; >> + } finally { >> + if (!success) { >> + IOUtils.closeWhileHandlingException(postingsWriter); >> + } >> + } >> + } >> + >> + @Override >> + public FieldsProducer 
fieldsProducer(SegmentReadState state) throws >> IOException { >> + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); >> + boolean success = false; >> + try { >> + FieldsProducer ret = new BlockTreeTermsReader(postingsReader, state); >> + success = true; >> + return ret; >> + } finally { >> + if (!success) { >> + IOUtils.closeWhileHandlingException(postingsReader); >> + } >> + } >> + } >> +} >> >> Added: >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java?rev=1670918&view=auto >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java >> (added) >> +++ >> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,22 @@ >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. 
>> + */ >> + >> +/** >> + * An experimental postings format that automatically indexes appropriate >> + * prefix terms for fast range and prefix queries. >> + */ >> +package org.apache.lucene.codecs.autoprefix; >> >> Modified: >> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1670918&r1=1670917&r2=1670918&view=diff >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat >> (original) >> +++ >> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat >> Thu Apr 2 15:05:48 2015 >> @@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.FSTOrdPo >> org.apache.lucene.codecs.memory.FSTPostingsFormat >> org.apache.lucene.codecs.memory.MemoryPostingsFormat >> org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat >> +org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat >> >> Added: >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java?rev=1670918&view=auto >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java >> (added) >> +++ >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,38 @@ >> +package org.apache.lucene.codecs.autoprefix; >> + >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * 
contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +import org.apache.lucene.index.IndexOptions; >> +import org.apache.lucene.index.RandomPostingsTester; >> +import org.apache.lucene.util.LuceneTestCase; >> +import org.apache.lucene.util.TestUtil; >> + >> +/** >> + * Tests AutoPrefix's postings >> + */ >> + >> +// NOTE: we don't extend BasePostingsFormatTestCase because we can only >> handle DOCS_ONLY fields: >> + >> +public class TestAutoPrefixPostingsFormat extends LuceneTestCase { >> + public void test() throws Exception { >> + new >> RandomPostingsTester(random()).testFull(TestUtil.alwaysPostingsFormat(new >> AutoPrefixPostingsFormat()), >> + createTempDir("autoprefix"), >> + IndexOptions.DOCS, >> + false); >> + } >> +} >> >> Added: >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java?rev=1670918&view=auto >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java >> (added) >> +++ >> 
lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,738 @@ >> +package org.apache.lucene.codecs.autoprefix; >> + >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. 
>> + */ >> + >> +import java.util.ArrayList; >> +import java.util.Collections; >> +import java.util.HashMap; >> +import java.util.HashSet; >> +import java.util.List; >> +import java.util.Locale; >> +import java.util.Map; >> +import java.util.Set; >> + >> +import org.apache.lucene.analysis.MockAnalyzer; >> +import org.apache.lucene.analysis.TokenStream; >> +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; >> +import org.apache.lucene.codecs.Codec; >> +import org.apache.lucene.document.BinaryDocValuesField; >> +import org.apache.lucene.document.Document; >> +import org.apache.lucene.document.Field; >> +import org.apache.lucene.document.FieldType; >> +import org.apache.lucene.document.NumericDocValuesField; >> +import org.apache.lucene.document.StringField; >> +import org.apache.lucene.index.BinaryDocValues; >> +import org.apache.lucene.index.DirectoryReader; >> +import org.apache.lucene.index.IndexOptions; >> +import org.apache.lucene.index.IndexReader; >> +import org.apache.lucene.index.IndexWriter; >> +import org.apache.lucene.index.IndexWriterConfig; >> +import org.apache.lucene.index.MultiDocValues; >> +import org.apache.lucene.index.MultiFields; >> +import org.apache.lucene.index.NumericDocValues; >> +import org.apache.lucene.index.PostingsEnum; >> +import org.apache.lucene.index.SerialMergeScheduler; >> +import org.apache.lucene.index.Term; >> +import org.apache.lucene.index.Terms; >> +import org.apache.lucene.index.TermsEnum; >> +import org.apache.lucene.search.MultiTermQuery; >> +import org.apache.lucene.search.PrefixQuery; >> +import org.apache.lucene.store.Directory; >> +import org.apache.lucene.util.AttributeImpl; >> +import org.apache.lucene.util.BytesRef; >> +import org.apache.lucene.util.FixedBitSet; >> +import org.apache.lucene.util.LuceneTestCase; >> +import org.apache.lucene.util.StringHelper; >> +import org.apache.lucene.util.TestUtil; >> +import org.apache.lucene.util.automaton.Automata; >> +import 
org.apache.lucene.util.automaton.CompiledAutomaton; >> + >> +public class TestAutoPrefixTerms extends LuceneTestCase { >> + >> + private int minItemsPerBlock = TestUtil.nextInt(random(), 2, 100); >> + private int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + >> random().nextInt(100); >> + private int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100); >> + private int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2, >> (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE; >> + >> + private final Codec codec = TestUtil.alwaysPostingsFormat(new >> AutoPrefixPostingsFormat(minItemsPerBlock, maxItemsPerBlock, >> + >> minTermsAutoPrefix, maxTermsAutoPrefix)); >> + >> + // Numbers in a restricted range, encoded in decimal, left-0-padded: >> + public void testBasicNumericRanges() throws Exception { >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + int numTerms = TestUtil.nextInt(random(), 3000, 50000); >> + Set<String> terms = new HashSet<>(); >> + int digits = TestUtil.nextInt(random(), 5, 10); >> + int maxValue = 1; >> + for(int i=0;i<digits;i++) { >> + maxValue *= 10; >> + } >> + String format = "%0" + digits + "d"; >> + while (terms.size() < numTerms) { >> + terms.add(String.format(Locale.ROOT, format, >> random().nextInt(maxValue))); >> + } >> + >> + for(String term : terms) { >> + Document doc = new Document(); >> + doc.add(new StringField("field", term, Field.Store.NO)); >> + doc.add(new NumericDocValuesField("field", Long.parseLong(term))); >> + w.addDocument(doc); >> + } >> + >> + if (VERBOSE) System.out.println("\nTEST: now optimize"); >> + if (random().nextBoolean()) { >> + w.forceMerge(1); >> + } >> + >> + if (VERBOSE) System.out.println("\nTEST: now done"); >> + IndexReader r = DirectoryReader.open(w, true); >> + >> + List<String> sortedTerms = new ArrayList<>(terms); >> 
+ Collections.sort(sortedTerms); >> + >> + if (VERBOSE) { >> + System.out.println("TEST: sorted terms:"); >> + int idx = 0; >> + for(String term : sortedTerms) { >> + System.out.println(idx + ": " + term); >> + idx++; >> + } >> + } >> + >> + int iters = atLeast(100); >> + for(int iter=0;iter<iters;iter++) { >> + int min, max; >> + while (true) { >> + min = random().nextInt(maxValue); >> + max = random().nextInt(maxValue); >> + if (min == max) { >> + continue; >> + } else if (min > max) { >> + int x = min; >> + min = max; >> + max = x; >> + } >> + break; >> + } >> + >> + if (VERBOSE) { >> + System.out.println("\nTEST: iter=" + iter + " min=" + min + " max=" >> + max); >> + } >> + >> + boolean minInclusive = random().nextBoolean(); >> + boolean maxInclusive = random().nextBoolean(); >> + BytesRef minTerm = new BytesRef(String.format(Locale.ROOT, format, >> min)); >> + BytesRef maxTerm = new BytesRef(String.format(Locale.ROOT, format, >> max)); >> + CompiledAutomaton ca = new >> CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, >> maxTerm, maxInclusive), >> + true, false, >> Integer.MAX_VALUE, true); >> + >> + TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field")); >> + NumericDocValues docValues = MultiDocValues.getNumericValues(r, >> "field"); >> + PostingsEnum postingsEnum = null; >> + >> + VerifyAutoPrefixTerms verifier = new >> VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm); >> + >> + while (te.next() != null) { >> + if (VERBOSE) { >> + System.out.println(" got term=" + te.term().utf8ToString()); >> + } >> + verifier.sawTerm(te.term()); >> + postingsEnum = te.postings(null, postingsEnum); >> + int docID; >> + while ((docID = postingsEnum.nextDoc()) != >> PostingsEnum.NO_MORE_DOCS) { >> + long v = docValues.get(docID); >> + assert v >= min && v <= max: "docID=" + docID + " v=" + v; >> + // The auto-prefix terms should never "overlap" one another, so >> we should only ever see a given docID one time: >> + if (VERBOSE) { >> + 
System.out.println(" got docID=" + docID + " v=" + v); >> + } >> + verifier.sawDoc(docID); >> + } >> + } >> + >> + int startLoc = Collections.binarySearch(sortedTerms, >> String.format(Locale.ROOT, format, min)); >> + if (startLoc < 0) { >> + startLoc = -startLoc-1; >> + } else if (minInclusive == false) { >> + startLoc++; >> + } >> + int endLoc = Collections.binarySearch(sortedTerms, >> String.format(Locale.ROOT, format, max)); >> + if (endLoc < 0) { >> + endLoc = -endLoc-2; >> + } else if (maxInclusive == false) { >> + endLoc--; >> + } >> + verifier.finish(endLoc-startLoc+1, maxTermsAutoPrefix); >> + } >> + >> + r.close(); >> + w.close(); >> + dir.close(); >> + } >> + >> + private static BytesRef intToBytes(int v) { >> + int sortableBits = v ^ 0x80000000; >> + BytesRef token = new BytesRef(4); >> + token.length = 4; >> + int index = 3; >> + while (index >= 0) { >> + token.bytes[index] = (byte) (sortableBits & 0xff); >> + index--; >> + sortableBits >>>= 8; >> + } >> + return token; >> + } >> + >> + // Numbers are encoded in full binary (4 byte ints): >> + public void testBinaryNumericRanges() throws Exception { >> + if (VERBOSE) { >> + System.out.println("TEST: minItemsPerBlock=" + minItemsPerBlock); >> + System.out.println("TEST: maxItemsPerBlock=" + maxItemsPerBlock); >> + System.out.println("TEST: minTermsAutoPrefix=" + minTermsAutoPrefix); >> + System.out.println("TEST: maxTermsAutoPrefix=" + maxTermsAutoPrefix); >> + } >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + int numTerms = TestUtil.nextInt(random(), 3000, 50000); >> + Set<Integer> terms = new HashSet<>(); >> + while (terms.size() < numTerms) { >> + terms.add(random().nextInt()); >> + } >> + >> + for(Integer term : terms) { >> + Document doc = new Document(); >> + doc.add(new BinaryField("field", intToBytes(term))); >> + doc.add(new 
NumericDocValuesField("field", term)); >> + w.addDocument(doc); >> + } >> + >> + if (random().nextBoolean()) { >> + if (VERBOSE) System.out.println("TEST: now force merge"); >> + w.forceMerge(1); >> + } >> + >> + IndexReader r = DirectoryReader.open(w, true); >> + >> + List<Integer> sortedTerms = new ArrayList<>(terms); >> + Collections.sort(sortedTerms); >> + >> + if (VERBOSE) { >> + System.out.println("TEST: sorted terms:"); >> + int idx = 0; >> + for(Integer term : sortedTerms) { >> + System.out.println(idx + ": " + term); >> + idx++; >> + } >> + } >> + >> + int iters = atLeast(100); >> + for(int iter=0;iter<iters;iter++) { >> + >> + int min, max; >> + while (true) { >> + min = random().nextInt(); >> + max = random().nextInt(); >> + if (min == max) { >> + continue; >> + } else if (min > max) { >> + int x = min; >> + min = max; >> + max = x; >> + } >> + break; >> + } >> + >> + if (VERBOSE) { >> + System.out.println("\nTEST: iter=" + iter + " min=" + min + " (" + >> intToBytes(min) + ") max=" + max + " (" + intToBytes(max) + ")"); >> + } >> + >> + boolean minInclusive = random().nextBoolean(); >> + BytesRef minTerm = intToBytes(min); >> + boolean maxInclusive = random().nextBoolean(); >> + BytesRef maxTerm = intToBytes(max); >> + CompiledAutomaton ca = new >> CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, >> maxTerm, maxInclusive), >> + true, false, >> Integer.MAX_VALUE, true); >> + >> + TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field")); >> + NumericDocValues docValues = MultiDocValues.getNumericValues(r, >> "field"); >> + PostingsEnum postingsEnum = null; >> + VerifyAutoPrefixTerms verifier = new >> VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm); >> + while (te.next() != null) { >> + if (VERBOSE) { >> + System.out.println(" got term=" + te.term() + " docFreq=" + >> te.docFreq()); >> + } >> + verifier.sawTerm(te.term()); >> + postingsEnum = te.postings(null, postingsEnum); >> + int docID; >> + while ((docID = 
postingsEnum.nextDoc()) != >> PostingsEnum.NO_MORE_DOCS) { >> + long v = docValues.get(docID); >> + assert v >= min && v <= max: "docID=" + docID + " v=" + v; >> + verifier.sawDoc(docID); >> + } >> + } >> + >> + int startLoc = Collections.binarySearch(sortedTerms, min); >> + if (startLoc < 0) { >> + startLoc = -startLoc-1; >> + } else if (minInclusive == false) { >> + startLoc++; >> + } >> + int endLoc = Collections.binarySearch(sortedTerms, max); >> + if (endLoc < 0) { >> + endLoc = -endLoc-2; >> + } else if (maxInclusive == false) { >> + endLoc--; >> + } >> + int expectedHits = endLoc-startLoc+1; >> + try { >> + verifier.finish(expectedHits, maxTermsAutoPrefix); >> + } catch (AssertionError ae) { >> + for(int i=0;i<numTerms;i++) { >> + if (verifier.allHits.get(i) == false) { >> + int v = (int) docValues.get(i); >> + boolean accept = (v > min || (v == min && minInclusive)) && >> + (v < max || (v == max && maxInclusive)); >> + if (accept) { >> + System.out.println("MISSING: docID=" + i + " v=" + v + " >> term=" + intToBytes(v)); >> + } >> + } >> + } >> + >> + throw ae; >> + } >> + } >> + >> + r.close(); >> + w.close(); >> + dir.close(); >> + } >> + >> + // Non-numeric, simple prefix query >> + public void testBasicPrefixTerms() throws Exception { >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + iwc.setMergeScheduler(new SerialMergeScheduler()); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + int numTerms = TestUtil.nextInt(random(), 3000, 50000); >> + Set<String> terms = new HashSet<>(); >> + while (terms.size() < numTerms) { >> + terms.add(TestUtil.randomSimpleString(random())); >> + } >> + >> + for(String term : terms) { >> + Document doc = new Document(); >> + doc.add(new StringField("field", term, Field.Store.NO)); >> + doc.add(new BinaryDocValuesField("field", new BytesRef(term))); >> + w.addDocument(doc); >> + } >> + >> + if (random().nextBoolean()) { >> 
+ if (VERBOSE) { >> + System.out.println("TEST: now force merge"); >> + } >> + w.forceMerge(1); >> + } >> + >> + IndexReader r = DirectoryReader.open(w, true); >> + >> + List<String> sortedTerms = new ArrayList<>(terms); >> + Collections.sort(sortedTerms); >> + >> + if (VERBOSE) { >> + System.out.println("TEST: sorted terms:"); >> + int idx = 0; >> + for(String term : sortedTerms) { >> + System.out.println(idx + ": " + term); >> + idx++; >> + } >> + } >> + >> + if (VERBOSE) { >> + System.out.println("TEST: r=" + r); >> + } >> + >> + int iters = atLeast(100); >> + for(int iter=0;iter<iters;iter++) { >> + if (VERBOSE) { >> + System.out.println("\nTEST: iter=" + iter); >> + } >> + >> + String prefix; >> + if (random().nextInt(100) == 42) { >> + prefix = ""; >> + } else { >> + prefix = TestUtil.randomSimpleString(random(), 1, 4); >> + } >> + BytesRef prefixBR = new BytesRef(prefix); >> + if (VERBOSE) { >> + System.out.println(" prefix=" + prefix); >> + } >> + >> + CompiledAutomaton ca = new >> CompiledAutomaton(PrefixQuery.toAutomaton(prefixBR), true, false, >> Integer.MAX_VALUE, true); >> + TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field")); >> + BinaryDocValues docValues = MultiDocValues.getBinaryValues(r, >> "field"); >> + PostingsEnum postingsEnum = null; >> + >> + VerifyAutoPrefixTerms verifier = new >> VerifyAutoPrefixTerms(r.maxDoc(), prefixBR); >> + >> + while (te.next() != null) { >> + if (VERBOSE) { >> + System.out.println("TEST: got term=" + te.term().utf8ToString() + >> " docFreq=" + te.docFreq()); >> + } >> + verifier.sawTerm(te.term()); >> + postingsEnum = te.postings(null, postingsEnum); >> + int docID; >> + while ((docID = postingsEnum.nextDoc()) != >> PostingsEnum.NO_MORE_DOCS) { >> + assertTrue("prefixBR=" + prefixBR + " docBR=" + >> docValues.get(docID), StringHelper.startsWith(docValues.get(docID), >> prefixBR)); >> + // The auto-prefix terms should never "overlap" one another, so >> we should only ever see a given docID one time: >> + 
verifier.sawDoc(docID); >> + } >> + } >> + >> + int startLoc = Collections.binarySearch(sortedTerms, prefix); >> + if (startLoc < 0) { >> + startLoc = -startLoc-1; >> + } >> + int endLoc = Collections.binarySearch(sortedTerms, prefix + (char) >> ('z'+1)); >> + if (endLoc < 0) { >> + endLoc = -endLoc-2; >> + } >> + int expectedHits = endLoc-startLoc+1; >> + try { >> + verifier.finish(expectedHits, maxTermsAutoPrefix); >> + } catch (AssertionError ae) { >> + for(int i=0;i<numTerms;i++) { >> + if (verifier.allHits.get(i) == false) { >> + String s = docValues.get(i).utf8ToString(); >> + if (s.startsWith(prefix)) { >> + System.out.println("MISSING: docID=" + i + " term=" + s); >> + } >> + } >> + } >> + >> + throw ae; >> + } >> + } >> + >> + r.close(); >> + w.close(); >> + dir.close(); >> + } >> + >> + public void testDemoPrefixTerms() throws Exception { >> + if (VERBOSE) { >> + System.out.println("\nTEST: minTermsAutoPrefix=" + minTermsAutoPrefix >> + " maxTermsAutoPrefix=" + maxTermsAutoPrefix); >> + System.out.println("\nTEST: minItemsPerBlock=" + minItemsPerBlock + " >> maxItemsPerBlock=" + maxItemsPerBlock); >> + } >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + int numDocs = 30; >> + >> + for(int i=0;i<numDocs;i++) { >> + Document doc = new Document(); >> + doc.add(new StringField("field", "" + (char) (97+i), Field.Store.NO)); >> + w.addDocument(doc); >> + doc = new Document(); >> + doc.add(new StringField("field", "a" + (char) (97+i), >> Field.Store.NO)); >> + w.addDocument(doc); >> + } >> + >> + if (random().nextBoolean()) { >> + w.forceMerge(1); >> + } >> + >> + IndexReader r = DirectoryReader.open(w, true); >> + Terms terms = MultiFields.getTerms(r, "field"); >> + if (VERBOSE) { >> + System.out.println("\nTEST: now intersect"); >> + } >> + CompiledAutomaton ca = new >> 
CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef("a")), false, false, >> Integer.MAX_VALUE, true); >> + TermsEnum te = ca.getTermsEnum(terms); >> + PostingsEnum postingsEnum = null; >> + >> + VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), >> new BytesRef("a")); >> + //TermsEnum te = terms.intersect(new CompiledAutomaton(a, true, false), >> null); >> + while (te.next() != null) { >> + verifier.sawTerm(te.term()); >> + postingsEnum = te.postings(null, postingsEnum); >> + int docID; >> + while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) >> { >> + // The auto-prefix terms should never "overlap" one another, so we >> should only ever see a given docID one time: >> + verifier.sawDoc(docID); >> + } >> + } >> + // 1 document has exactly "a", and 30 documents had "a?" >> + verifier.finish(31, maxTermsAutoPrefix); >> + PrefixQuery q = new PrefixQuery(new Term("field", "a")); >> + q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE); >> + assertEquals(31, newSearcher(r).search(q, 1).totalHits); >> + r.close(); >> + w.close(); >> + dir.close(); >> + } >> + >> + static final class BinaryTokenStream extends TokenStream { >> + private final ByteTermAttribute bytesAtt = >> addAttribute(ByteTermAttribute.class); >> + private boolean available = true; >> + >> + public BinaryTokenStream(BytesRef bytes) { >> + bytesAtt.setBytesRef(bytes); >> + } >> + >> + @Override >> + public boolean incrementToken() { >> + if (available) { >> + clearAttributes(); >> + available = false; >> + return true; >> + } >> + return false; >> + } >> + >> + @Override >> + public void reset() { >> + available = true; >> + } >> + >> + public interface ByteTermAttribute extends TermToBytesRefAttribute { >> + void setBytesRef(BytesRef bytes); >> + } >> + >> + public static class ByteTermAttributeImpl extends AttributeImpl >> implements ByteTermAttribute,TermToBytesRefAttribute { >> + private BytesRef bytes; >> + >> + @Override >> + public void 
fillBytesRef() { >> + // no-op: the bytes was already filled by our owner's incrementToken >> + } >> + >> + @Override >> + public BytesRef getBytesRef() { >> + return bytes; >> + } >> + >> + @Override >> + public void setBytesRef(BytesRef bytes) { >> + this.bytes = bytes; >> + } >> + >> + @Override >> + public void clear() {} >> + >> + @Override >> + public void copyTo(AttributeImpl target) { >> + ByteTermAttributeImpl other = (ByteTermAttributeImpl) target; >> + other.bytes = bytes; >> + } >> + } >> + } >> + >> + /** Basically a StringField that accepts binary term. */ >> + private static class BinaryField extends Field { >> + >> + final static FieldType TYPE; >> + static { >> + TYPE = new FieldType(StringField.TYPE_NOT_STORED); >> + // Necessary so our custom tokenStream is used by Field.tokenStream: >> + TYPE.setTokenized(true); >> + TYPE.freeze(); >> + } >> + >> + public BinaryField(String name, BytesRef value) { >> + super(name, new BinaryTokenStream(value), TYPE); >> + } >> + } >> + >> + /** Helper class to ensure auto-prefix terms 1) never overlap one >> another, and 2) are used when they should be. */ >> + private static class VerifyAutoPrefixTerms { >> + final FixedBitSet allHits; >> + private final Map<BytesRef,Integer> prefixCounts = new HashMap<>(); >> + private int totPrefixCount; >> + private final BytesRef[] bounds; >> + private int totTermCount; >> + private BytesRef lastTerm; >> + >> + public VerifyAutoPrefixTerms(int maxDoc, BytesRef... 
bounds) { >> + allHits = new FixedBitSet(maxDoc); >> + assert bounds.length > 0; >> + this.bounds = bounds; >> + } >> + >> + public void sawTerm(BytesRef term) { >> + //System.out.println("saw term=" + term); >> + if (lastTerm != null) { >> + assertTrue(lastTerm.compareTo(term) < 0); >> + } >> + lastTerm = BytesRef.deepCopyOf(term); >> + totTermCount++; >> + totPrefixCount += term.length; >> + for(int i=1;i<=term.length;i++) { >> + BytesRef prefix = BytesRef.deepCopyOf(term); >> + prefix.length = i; >> + Integer count = prefixCounts.get(prefix); >> + if (count == null) { >> + count = 1; >> + } else { >> + count += 1; >> + } >> + prefixCounts.put(prefix, count); >> + } >> + } >> + >> + public void sawDoc(int docID) { >> + // The auto-prefix terms should never "overlap" one another, so we >> should only ever see a given docID one time: >> + assertFalse(allHits.getAndSet(docID)); >> + } >> + >> + public void finish(int expectedNumHits, int maxPrefixCount) { >> + >> + if (maxPrefixCount != -1) { >> + // Auto-terms were used in this test >> + long allowedMaxTerms; >> + >> + if (bounds.length == 1) { >> + // Simple prefix query: we should never see more than >> maxPrefixCount terms: >> + allowedMaxTerms = maxPrefixCount; >> + } else { >> + // Trickier: we need to allow for maxPrefixTerms for each >> different leading byte in the min and max: >> + assert bounds.length == 2; >> + BytesRef minTerm = bounds[0]; >> + BytesRef maxTerm = bounds[1]; >> + >> + int commonPrefix = 0; >> + for(int i=0;i<minTerm.length && i<maxTerm.length;i++) { >> + if (minTerm.bytes[minTerm.offset+i] != >> maxTerm.bytes[maxTerm.offset+i]) { >> + commonPrefix = i; >> + break; >> + } >> + } >> + >> + allowedMaxTerms = maxPrefixCount * (long) >> ((minTerm.length-commonPrefix) + (maxTerm.length-commonPrefix)); >> + } >> + >> + assertTrue("totTermCount=" + totTermCount + " is > >> allowedMaxTerms=" + allowedMaxTerms, totTermCount <= allowedMaxTerms); >> + } >> + >> + assertEquals(expectedNumHits, 
allHits.cardinality()); >> + int sum = 0; >> + for(Map.Entry<BytesRef,Integer> ent : prefixCounts.entrySet()) { >> + >> + BytesRef prefix = ent.getKey(); >> + if (VERBOSE) { >> + System.out.println(" verify prefix=" + >> TestUtil.bytesRefToString(prefix) + " count=" + ent.getValue()); >> + } >> + >> + if (maxPrefixCount != -1) { >> + // Auto-terms were used in this test >> + >> + int sumLeftoverSuffix = 0; >> + for(BytesRef bound : bounds) { >> + >> + int minSharedLength = Math.min(bound.length, prefix.length); >> + int commonPrefix = minSharedLength; >> + for(int i=0;i<minSharedLength;i++) { >> + if (bound.bytes[bound.offset+i] != >> prefix.bytes[prefix.offset+i]) { >> + commonPrefix = i; >> + break; >> + } >> + } >> + sumLeftoverSuffix += bound.length - commonPrefix; >> + } >> + >> + long limit = (1+sumLeftoverSuffix) * (long) maxPrefixCount; >> + >> + assertTrue("maxPrefixCount=" + maxPrefixCount + " prefix=" + >> prefix + " sumLeftoverSuffix=" + sumLeftoverSuffix + " limit=" + limit + " >> vs actual=" +ent.getValue(), >> + ent.getValue() <= limit); >> + } >> + >> + sum += ent.getValue(); >> + } >> + >> + // Make sure no test bug: >> + assertEquals(totPrefixCount, sum); >> + } >> + } >> + >> + /** Make sure you get clear exc. if you try to use this within anything >> but IndexOptions.DOCS fields. 
*/ >> + public void testWithFreqs() throws Exception { >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); >> + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); >> + Document doc = new Document(); >> + doc.add(new Field("foo", "bar bar", ft)); >> + w.addDocument(doc); >> + try { >> + w.commit(); >> + } catch (IllegalStateException ise) { >> + assertEquals("ranges can only be indexed with IndexOptions.DOCS >> (field: foo)", ise.getMessage()); >> + } >> + w.close(); >> + dir.close(); >> + } >> + >> + /** Make sure you get clear exc. if you try to use this within anything >> but IndexOptions.DOCS fields. */ >> + public void testWithPositions() throws Exception { >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); >> + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); >> + Document doc = new Document(); >> + doc.add(new Field("foo", "bar bar", ft)); >> + w.addDocument(doc); >> + try { >> + w.commit(); >> + } catch (IllegalStateException ise) { >> + assertEquals("ranges can only be indexed with IndexOptions.DOCS >> (field: foo)", ise.getMessage()); >> + } >> + w.close(); >> + dir.close(); >> + } >> + >> + /** Make sure you get clear exc. if you try to use this within anything >> but IndexOptions.DOCS fields. 
*/ >> + public void testWithOffsets() throws Exception { >> + Directory dir = newDirectory(); >> + IndexWriterConfig iwc = new IndexWriterConfig(new >> MockAnalyzer(random())); >> + iwc.setCodec(codec); >> + IndexWriter w = new IndexWriter(dir, iwc); >> + FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); >> + >> ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); >> + Document doc = new Document(); >> + doc.add(new Field("foo", "bar bar", ft)); >> + w.addDocument(doc); >> + try { >> + w.commit(); >> + } catch (IllegalStateException ise) { >> + assertEquals("ranges can only be indexed with IndexOptions.DOCS >> (field: foo)", ise.getMessage()); >> + } >> + w.close(); >> + dir.close(); >> + } >> +} >> >> Modified: >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java?rev=1670918&r1=1670917&r2=1670918&view=diff >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java >> (original) >> +++ >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java >> Thu Apr 2 15:05:48 2015 >> @@ -16,6 +16,7 @@ package org.apache.lucene.codecs; >> * limitations under the License. >> */ >> >> +import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; // javadocs >> import org.apache.lucene.index.OrdTermState; >> import org.apache.lucene.index.TermState; >> >> @@ -23,6 +24,8 @@ import org.apache.lucene.index.TermState >> * Holds all state required for {@link PostingsReaderBase} >> * to produce a {@link org.apache.lucene.index.PostingsEnum} without >> re-seeking the >> * terms dict. 
>> + * >> + * @lucene.internal >> */ >> public class BlockTermState extends OrdTermState { >> /** how many docs have this term */ >> @@ -36,6 +39,11 @@ public class BlockTermState extends OrdT >> // TODO: update BTR to nuke this >> public long blockFilePointer; >> >> + /** True if this term is "real" (e.g., not an auto-prefix term or >> + * some other "secret" term; currently only {@link BlockTreeTermsReader} >> + * sets this). */ >> + public boolean isRealTerm; >> + >> /** Sole constructor. (For invocation by subclass >> * constructors, typically implicit.) */ >> protected BlockTermState() { >> @@ -50,10 +58,11 @@ public class BlockTermState extends OrdT >> totalTermFreq = other.totalTermFreq; >> termBlockOrd = other.termBlockOrd; >> blockFilePointer = other.blockFilePointer; >> + isRealTerm = other.isRealTerm; >> } >> >> @Override >> public String toString() { >> - return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " >> termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer; >> + return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " >> termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer + " >> isRealTerm=" + isRealTerm; >> } >> } >> >> Modified: >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java?rev=1670918&r1=1670917&r2=1670918&view=diff >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java >> (original) >> +++ >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java >> Thu Apr 2 15:05:48 2015 >> @@ -62,6 +62,7 @@ public abstract class PostingsFormat imp >> * @param name must be all ascii alphanumeric, and less than 128 >> characters in length. 
>> */ >> protected PostingsFormat(String name) { >> + // TODO: can we somehow detect name conflicts here? Two different >> classes trying to claim the same name? Otherwise you see confusing errors... >> NamedSPILoader.checkServiceName(name); >> this.name = name; >> } >> >> Added: >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java?rev=1670918&view=auto >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java >> (added) >> +++ >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,415 @@ >> +package org.apache.lucene.codecs.blocktree; >> + >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. 
>> + */ >> + >> +import java.io.IOException; >> +import java.util.ArrayList; >> +import java.util.Collections; >> +import java.util.List; >> + >> +import org.apache.lucene.index.FilteredTermsEnum; >> +import org.apache.lucene.index.Terms; >> +import org.apache.lucene.index.TermsEnum; >> +import org.apache.lucene.util.ArrayUtil; >> +import org.apache.lucene.util.BytesRef; >> +import org.apache.lucene.util.BytesRefBuilder; >> +import org.apache.lucene.util.StringHelper; >> + >> +// TODO: instead of inlining auto-prefix terms with normal terms, >> +// we could write them into their own virtual/private field. This >> +// would make search time a bit more complex, since we'd need to >> +// merge sort between two TermEnums, but it would also make stats >> +// API (used by CheckIndex -verbose) easier to implement since we could >> +// just walk this virtual field and gather its stats) >> + >> +/** Used in the first pass when writing a segment to locate >> + * "appropriate" auto-prefix terms to pre-compile into the index. >> + * This visits every term in the index to find prefixes that >> + * match >= min and <= max number of terms. */ >> + >> +class AutoPrefixTermsWriter { >> + >> + //static boolean DEBUG = BlockTreeTermsWriter.DEBUG; >> + //static boolean DEBUG = false; >> + //static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2; >> + //static boolean DEBUG2 = true; >> + >> + /** Describes a range of term-space to match, either a simple prefix >> + * (foo*) or a floor-block range of a prefix (e.g. foo[a-m]*, >> + * foo[n-z]*) when there are too many terms starting with foo*. */ >> + public static final class PrefixTerm implements Comparable<PrefixTerm> { >> + /** Common prefix */ >> + public final byte[] prefix; >> + >> + /** If this is -2, this is a normal prefix (foo *), else it's the >> minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). 
*/ >> + public final int floorLeadStart; >> + >> + /** The lead byte (inclusive) of the suffix for the term range we match >> (e.g. 'm' in foo[d-m*]); this is ignored when >> + * floorLeadStart is -2. */ >> + public final int floorLeadEnd; >> + >> + public final BytesRef term; >> + >> + /** Sole constructor. */ >> + public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) { >> + this.prefix = prefix; >> + this.floorLeadStart = floorLeadStart; >> + this.floorLeadEnd = floorLeadEnd; >> + this.term = toBytesRef(prefix, floorLeadStart); >> + >> + assert floorLeadEnd >= floorLeadStart; >> + assert floorLeadEnd >= 0; >> + assert floorLeadStart == -2 || floorLeadStart >= 0; >> + >> + // We should never create empty-string prefix term: >> + assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd != >> 0xff; >> + } >> + >> + @Override >> + public String toString() { >> + String s = brToString(new BytesRef(prefix)); >> + if (floorLeadStart == -2) { >> + s += "[-" + Integer.toHexString(floorLeadEnd) + "]"; >> + } else { >> + s += "[" + Integer.toHexString(floorLeadStart) + "-" + >> Integer.toHexString(floorLeadEnd) + "]"; >> + } >> + return s; >> + } >> + >> + @Override >> + public int compareTo(PrefixTerm other) { >> + int cmp = term.compareTo(other.term); >> + if (cmp == 0) { >> + if (prefix.length != other.prefix.length) { >> + return prefix.length - other.prefix.length; >> + } >> + >> + // On tie, sort the bigger floorLeadEnd, earlier, since it >> + // spans more terms, so during intersect, we want to encounter this >> one >> + // first so we can use it if the automaton accepts the larger range: >> + cmp = other.floorLeadEnd - floorLeadEnd; >> + } >> + >> + return cmp; >> + } >> + >> + /** Returns the leading term for this prefix term, e.g. "foo" (for >> + * the foo* prefix) or "foom" (for the foo[m-z]* case). 
*/ >> + private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) { >> + BytesRef br; >> + if (floorLeadStart != -2) { >> + assert floorLeadStart >= 0; >> + br = new BytesRef(prefix.length+1); >> + } else { >> + br = new BytesRef(prefix.length); >> + } >> + System.arraycopy(prefix, 0, br.bytes, 0, prefix.length); >> + br.length = prefix.length; >> + if (floorLeadStart != -2) { >> + assert floorLeadStart >= 0; >> + br.bytes[br.length++] = (byte) floorLeadStart; >> + } >> + >> + return br; >> + } >> + >> + public int compareTo(BytesRef term) { >> + return this.term.compareTo(term); >> + } >> + >> + public TermsEnum getTermsEnum(TermsEnum in) { >> + >> + final BytesRef prefixRef = new BytesRef(prefix); >> + >> + return new FilteredTermsEnum(in) { >> + { >> + setInitialSeekTerm(term); >> + } >> + >> + @Override >> + protected AcceptStatus accept(BytesRef term) { >> + if (StringHelper.startsWith(term, prefixRef) && >> + (floorLeadEnd == -1 || term.length == prefixRef.length || >> (term.bytes[term.offset + prefixRef.length] & 0xff) <= floorLeadEnd)) { >> + return AcceptStatus.YES; >> + } else { >> + return AcceptStatus.END; >> + } >> + } >> + }; >> + } >> + } >> + >> + // for debugging >> + static String brToString(BytesRef b) { >> + try { >> + return b.utf8ToString() + " " + b; >> + } catch (Throwable t) { >> + // If BytesRef isn't actually UTF8, or it's eg a >> + // prefix of UTF8 that ends mid-unicode-char, we >> + // fallback to hex: >> + return b.toString(); >> + } >> + } >> + >> + final List<PrefixTerm> prefixes = new ArrayList<>(); >> + private final int minItemsInPrefix; >> + private final int maxItemsInPrefix; >> + >> + // Records index into pending where the current prefix at that >> + // length "started"; for example, if current term starts with 't', >> + // startsByPrefix[0] is the index into pending for the first >> + // term/sub-block starting with 't'. 
We use this to figure out when >> + // to write a new block: >> + private final BytesRefBuilder lastTerm = new BytesRefBuilder(); >> + private int[] prefixStarts = new int[8]; >> + private List<Object> pending = new ArrayList<>(); >> + >> + //private final String segment; >> + >> + public AutoPrefixTermsWriter(Terms terms, int minItemsInPrefix, int >> maxItemsInPrefix) throws IOException { >> + this.minItemsInPrefix = minItemsInPrefix; >> + this.maxItemsInPrefix = maxItemsInPrefix; >> + //this.segment = segment; >> + >> + TermsEnum termsEnum = terms.iterator(null); >> + while (true) { >> + BytesRef term = termsEnum.next(); >> + if (term == null) { >> + break; >> + } >> + //if (DEBUG) System.out.println("pushTerm: " + brToString(term)); >> + pushTerm(term); >> + } >> + >> + if (pending.size() > 1) { >> + pushTerm(BlockTreeTermsWriter.EMPTY_BYTES_REF); >> + >> + // Also maybe save floor prefixes in root block; this can be a >> biggish perf gain for large ranges: >> + /* >> + System.out.println("root block pending.size=" + pending.size()); >> + for(Object o : pending) { >> + System.out.println(" " + o); >> + } >> + */ >> + while (pending.size() >= minItemsInPrefix) { >> + savePrefixes(0, pending.size()); >> + } >> + } >> + >> + Collections.sort(prefixes); >> + } >> + >> + /** Pushes the new term to the top of the stack, and writes new blocks. 
*/ >> + private void pushTerm(BytesRef text) throws IOException { >> + int limit = Math.min(lastTerm.length(), text.length); >> + >> + // Find common prefix between last term and current term: >> + int pos = 0; >> + while (pos < limit && lastTerm.byteAt(pos) == >> text.bytes[text.offset+pos]) { >> + pos++; >> + } >> + >> + //if (DEBUG) System.out.println(" shared=" + pos + " >> lastTerm.length=" + lastTerm.length()); >> + >> + // Close the "abandoned" suffix now: >> + for(int i=lastTerm.length()-1;i>=pos;i--) { >> + >> + // How many items on top of the stack share the current suffix >> + // we are closing: >> + int prefixTopSize = pending.size() - prefixStarts[i]; >> + >> + while (prefixTopSize >= minItemsInPrefix) { >> + //if (DEBUG) System.out.println("pushTerm i=" + i + " >> prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix); >> + savePrefixes(i+1, prefixTopSize); >> + //prefixStarts[i] -= prefixTopSize; >> + //System.out.println(" after savePrefixes: " + (pending.size() - >> prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + >> prefixStarts[i]); >> + >> + // For large floor blocks, it's possible we should now re-run on >> the new prefix terms we just created: >> + prefixTopSize = pending.size() - prefixStarts[i]; >> + } >> + } >> + >> + if (prefixStarts.length < text.length) { >> + prefixStarts = ArrayUtil.grow(prefixStarts, text.length); >> + } >> + >> + // Init new tail: >> + for(int i=pos;i<text.length;i++) { >> + prefixStarts[i] = pending.size(); >> + } >> + >> + lastTerm.copyBytes(text); >> + >> + // Only append the first (optional) empty string, no the fake last one >> used to close all prefixes: >> + if (text.length > 0 || pending.isEmpty()) { >> + byte[] termBytes = new byte[text.length]; >> + System.arraycopy(text.bytes, text.offset, termBytes, 0, text.length); >> + pending.add(termBytes); >> + } >> + } >> + >> + void savePrefixes(int prefixLength, int count) throws IOException { >> + >> + assert count > 0; >> + 
>> + //if (DEBUG2) { >> + // BytesRef br = new BytesRef(lastTerm.bytes()); >> + // br.length = prefixLength; >> + // System.out.println(" savePrefixes: seg=" + segment + " " + >> brToString(br) + " count=" + count + " pending.size()=" + pending.size()); >> + //} >> + >> + int lastSuffixLeadLabel = -2; >> + >> + int start = pending.size()-count; >> + assert start >=0; >> + >> + int end = pending.size(); >> + int nextBlockStart = start; >> + int nextFloorLeadLabel = -1; >> + int prefixCount = 0; >> + int pendingCount = 0; >> + PrefixTerm lastPTEntry = null; >> + for (int i=start; i<end; i++) { >> + >> + byte[] termBytes; >> + Object o = pending.get(i); >> + PrefixTerm ptEntry; >> + if (o instanceof byte[]) { >> + ptEntry = null; >> + termBytes = (byte[]) o; >> + } else { >> + ptEntry = (PrefixTerm) o; >> + termBytes = ptEntry.term.bytes; >> + if (ptEntry.prefix.length != prefixLength) { >> + assert ptEntry.prefix.length > prefixLength; >> + ptEntry = null; >> + } >> + } >> + pendingCount++; >> + >> + //if (DEBUG) System.out.println(" check term=" + brToString(new >> BytesRef(termBytes))); >> + >> + int suffixLeadLabel; >> + >> + if (termBytes.length == prefixLength) { >> + // Suffix is 0, i.e. 
prefix 'foo' and term is >> + // 'foo' so the term has empty string suffix >> + // in this block >> + assert lastSuffixLeadLabel == -2; >> + suffixLeadLabel = -2; >> + } else { >> + suffixLeadLabel = termBytes[prefixLength] & 0xff; >> + } >> + >> + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " >> suffixLeadLabel=" + suffixLeadLabel); >> + >> + if (suffixLeadLabel != lastSuffixLeadLabel) { >> + // This is a boundary, a chance to make an auto-prefix term if we >> want: >> + >> + // When we are "recursing" (generating auto-prefix terms on a block >> of >> + // floor'd auto-prefix terms), this assert is non-trivial because it >> + // ensures the floorLeadEnd of the previous terms is in fact less >> + // than the lead start of the current entry: >> + assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + >> suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel; >> + >> + // NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is >> 2 and prefix is 'a' and we've seen 'a' and then 'aa' >> + if (pendingCount >= minItemsInPrefix && end-nextBlockStart > >> maxItemsInPrefix && nextFloorLeadLabel != -1) { >> + // The count is too large for one block, so we must break it into >> "floor" blocks, where we record >> + // the leading label of the suffix of the first term in each >> floor block, so at search time we can >> + // jump to the right floor block. We just use a naive greedy >> segmenter here: make a new floor >> + // block as soon as we have at least minItemsInBlock. 
This is >> not always best: it often produces >> + // a too-small block as the final block: >> + >> + // If the last entry was another prefix term of the same length, >> then it represents a range of terms, so we must use its ending >> + // prefix label as our ending label: >> + if (lastPTEntry != null) { >> + lastSuffixLeadLabel = lastPTEntry.floorLeadEnd; >> + } >> + >> + savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel); >> + pendingCount = 0; >> + >> + prefixCount++; >> + nextFloorLeadLabel = suffixLeadLabel; >> + nextBlockStart = i; >> + } >> + >> + if (nextFloorLeadLabel == -1) { >> + nextFloorLeadLabel = suffixLeadLabel; >> + //if (DEBUG) System.out.println("set first lead label=" + >> nextFloorLeadLabel); >> + } >> + >> + lastSuffixLeadLabel = suffixLeadLabel; >> + } >> + lastPTEntry = ptEntry; >> + } >> + >> + // Write last block, if any: >> + if (nextBlockStart < end) { >> + //System.out.println(" lastPTEntry=" + lastPTEntry + " >> lastSuffixLeadLabel=" + lastSuffixLeadLabel); >> + if (lastPTEntry != null) { >> + lastSuffixLeadLabel = lastPTEntry.floorLeadEnd; >> + } >> + assert lastSuffixLeadLabel >= nextFloorLeadLabel: >> "lastSuffixLeadLabel=" + lastSuffixLeadLabel + " nextFloorLeadLabel=" + >> nextFloorLeadLabel; >> + if (prefixCount == 0) { >> + if (prefixLength > 0) { >> + savePrefix(prefixLength, -2, 0xff); >> + prefixCount++; >> + } else { >> + // Don't add a prefix term for all terms in the index! 
>> + } >> + } else { >> + if (lastSuffixLeadLabel == -2) { >> + // Special case when closing the empty string root block: >> + lastSuffixLeadLabel = 0xff; >> + } >> + savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel); >> + prefixCount++; >> + } >> + } >> + >> + // Remove slice from the top of the pending stack, that we just wrote: >> + int sizeToClear = count; >> + if (prefixCount > 1) { >> + Object o = pending.get(pending.size()-count); >> + if (o instanceof byte[] && ((byte[]) o).length == prefixLength) { >> + // If we were just asked to write all f* terms, but there were too >> many and so we made floor blocks, the exact term 'f' will remain >> + // as its own item, followed by floor block terms like f[a-m]*, >> f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack: >> + sizeToClear--; >> + } >> + } >> + pending.subList(pending.size()-sizeToClear, pending.size()).clear(); >> + >> + // Append prefix terms for each prefix, since these count like real >> terms that also need to be "rolled up": >> + for(int i=0;i<prefixCount;i++) { >> + PrefixTerm pt = prefixes.get(prefixes.size()-(prefixCount-i)); >> + pending.add(pt); >> + } >> + } >> + >> + private void savePrefix(int prefixLength, int floorLeadStart, int >> floorLeadEnd) { >> + byte[] prefix = new byte[prefixLength]; >> + System.arraycopy(lastTerm.bytes(), 0, prefix, 0, prefixLength); >> + assert floorLeadStart != -1; >> + assert floorLeadEnd != -1; >> + >> + PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd); >> + //if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " >> + pt + " count=" + count); >> + prefixes.add(pt); >> + } >> +} >> >> Added: >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java?rev=1670918&view=auto >> 
============================================================================== >> --- >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java >> (added) >> +++ >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,95 @@ >> +package org.apache.lucene.codecs.blocktree; >> + >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +import java.io.IOException; >> + >> +import org.apache.lucene.index.PostingsEnum; >> +import org.apache.lucene.search.DocIdSetIterator; >> +import org.apache.lucene.util.BitSet; >> +import org.apache.lucene.util.BitSetIterator; >> +import org.apache.lucene.util.BytesRef; >> +import org.apache.lucene.util.FixedBitSet; // javadocs >> + >> +/** Takes a {@link FixedBitSet} and creates a DOCS {@link PostingsEnum} >> from it. 
*/ >> + >> +class BitSetPostingsEnum extends PostingsEnum { >> + private final BitSet bits; >> + private DocIdSetIterator in; >> + >> + BitSetPostingsEnum(BitSet bits) { >> + this.bits = bits; >> + reset(); >> + } >> + >> + @Override >> + public int freq() throws IOException { >> + return 1; >> + } >> + >> + @Override >> + public int docID() { >> + if (in == null) { >> + return -1; >> + } else { >> + return in.docID(); >> + } >> + } >> + >> + @Override >> + public int nextDoc() throws IOException { >> + if (in == null) { >> + in = new BitSetIterator(bits, 0); >> + } >> + return in.nextDoc(); >> + } >> + >> + @Override >> + public int advance(int target) throws IOException { >> + return in.advance(target); >> + } >> + >> + @Override >> + public long cost() { >> + return in.cost(); >> + } >> + >> + void reset() { >> + in = null; >> + } >> + >> + @Override >> + public BytesRef getPayload() { >> + return null; >> + } >> + >> + @Override >> + public int nextPosition() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public int startOffset() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public int endOffset() { >> + throw new UnsupportedOperationException(); >> + } >> +} >> >> Added: >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java?rev=1670918&view=auto >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java >> (added) >> +++ >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java >> Thu Apr 2 15:05:48 2015 >> @@ -0,0 +1,87 @@ >> +package org.apache.lucene.codecs.blocktree; >> + >> +/* >> + * Licensed to the Apache Software Foundation (ASF) under one or 
more >> + * contributor license agreements. See the NOTICE file distributed with >> + * this work for additional information regarding copyright ownership. >> + * The ASF licenses this file to You under the Apache License, Version 2.0 >> + * (the "License"); you may not use this file except in compliance with >> + * the License. You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +import org.apache.lucene.codecs.PostingsWriterBase; >> +import org.apache.lucene.index.PostingsEnum; >> +import org.apache.lucene.index.TermsEnum; >> +import org.apache.lucene.util.BitSet; >> +import org.apache.lucene.util.Bits; >> +import org.apache.lucene.util.BytesRef; >> + >> +/** Silly stub class, used only when writing an auto-prefix >> + * term in order to expose DocsEnum over a FixedBitSet. We >> + * pass this to {@link PostingsWriterBase#writeTerm} so >> + * that it can pull .docs() multiple times for the >> + * current term. 
*/ >> + >> +class BitSetTermsEnum extends TermsEnum { >> + private final BitSetPostingsEnum postingsEnum; >> + >> + public BitSetTermsEnum(BitSet docs) { >> + postingsEnum = new BitSetPostingsEnum(docs); >> + } >> + >> + @Override >> + public SeekStatus seekCeil(BytesRef text) { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public void seekExact(long ord) { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public BytesRef term() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public BytesRef next() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public long ord() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public int docFreq() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public long totalTermFreq() { >> + throw new UnsupportedOperationException(); >> + } >> + >> + @Override >> + public PostingsEnum postings(Bits liveDocs, PostingsEnum reuse, int >> flags) { >> + if (flags != PostingsEnum.NONE) { >> + // We only work with DOCS_ONLY fields >> + return null; >> + } >> + if (liveDocs != null) { >> + throw new IllegalArgumentException("cannot handle live docs"); >> + } >> + postingsEnum.reset(); >> + return postingsEnum; >> + } >> +} >> >> Modified: >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java >> URL: >> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java?rev=1670918&r1=1670917&r2=1670918&view=diff >> ============================================================================== >> --- >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java >> (original) >> +++ >> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java >> Thu Apr 2 15:05:48 2015 >> 
@@ -34,6 +34,8 @@ import org.apache.lucene.index.IndexFile >> import org.apache.lucene.index.IndexOptions; >> import org.apache.lucene.index.SegmentReadState; >> import org.apache.lucene.index.Terms; >> +import org.apache.lucene.search.PrefixQuery; // javadocs >> +import org.apache.lucene.search.TermRangeQuery; // javadocs >> import org.apache.lucene.store.IndexInput; >> import org.apache.lucene.util.Accountable; >> import org.apache.lucene.util.Accountables; >> @@ -57,6 +59,14 @@ import org.apache.lucene.util.fst.Output >> * min/maxItemsPerBlock during indexing to control how >> * much memory the terms index uses.</p> >> * >> + * <p>If auto-prefix terms were indexed (see >> + * {@link BlockTreeTermsWriter}), then the {@link Terms#intersect} >> + * implementation here will make use of these terms only if the >> + * automaton has a binary sink state, i.e. an accept state >> + * which has a transition to itself accepting all byte values. >> + * For example, both {@link PrefixQuery} and {@link TermRangeQuery} >> + * pass such automata to {@link Terms#intersect}.</p> >> + * >> * <p>The data structure used by this implementation is very >> * similar to a burst trie >> * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499), >> @@ -90,8 +100,11 @@ public final class BlockTreeTermsReader >> /** Initial terms format. */ >> public static final int VERSION_START = 0; >> >> + /** Auto-prefix terms. */ >> + public static final int VERSION_AUTO_PREFIX_TERMS = 1; >> + >> /** Current terms format. */ >> - public static final int VERSION_CURRENT = VERSION_START; >> + public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS; >> >> /** Extension of terms index file */ >> static final String TERMS_INDEX_EXTENSION = "tip"; >> @@ -116,7 +129,7 @@ public final class BlockTreeTermsReader >> >> final String segment; >> >> - private final int version; >> + final int version; >> >> /** Sole constructor. 
*/ >> public BlockTreeTermsReader(PostingsReaderBase postingsReader, >> SegmentReadState state) throws IOException { >> >> > > --------------------------------------------------------------------- > To unsubscribe, e-mail: [email protected] > For additional commands, e-mail: [email protected] > --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
