NOTE: I hit a compile failure like the one below (TermRangeTermsEnum was removed).
I am going to remove these asserts, since to me they don't look very useful,
which fixes the build for now.
compile-test:
[mkdir] Created dir:
/home/rmuir/workspace/trunk/lucene/build/core/classes/test
[javac] Compiling 431 source files to
/home/rmuir/workspace/trunk/lucene/build/core/classes/test
[javac]
/home/rmuir/workspace/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java:123:
error: cannot find symbol
[javac] assertFalse(query.getTermsEnum(terms) instanceof
TermRangeTermsEnum);
[javac] ^
[javac] symbol: class TermRangeTermsEnum
[javac] location: class TestTermRangeQuery
[javac]
/home/rmuir/workspace/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java:128:
error: cannot find symbol
[javac] assertFalse(query.getTermsEnum(terms) instanceof
TermRangeTermsEnum);
[javac] ^
[javac] symbol: class TermRangeTermsEnum
[javac] location: class TestTermRangeQuery
[javac]
/home/rmuir/workspace/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java:132:
error: cannot find symbol
[javac] assertFalse(query.getTermsEnum(terms) instanceof
TermRangeTermsEnum);
[javac] ^
[javac] symbol: class TermRangeTermsEnum
[javac] location: class TestTermRangeQuery
[javac] Note: Some input files use or override a deprecated API.
[javac] Note: Recompile with -Xlint:deprecation for details.
[javac] 3 errors
On Thu, Apr 2, 2015 at 11:05 AM, <[email protected]> wrote:
> Author: mikemccand
> Date: Thu Apr 2 15:05:48 2015
> New Revision: 1670918
>
> URL: http://svn.apache.org/r1670918
> Log:
> LUCENE-5879: add auto-prefix terms to block tree, and experimental
> AutoPrefixTermsPostingsFormat
>
> Added:
>
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/
>
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java
> (with props)
>
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java
> (with props)
>
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/
>
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java
> (with props)
>
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java
> (with props)
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
> (with props)
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java
> (with props)
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java
> (with props)
>
> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
> (with props)
> Removed:
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TermRangeTermsEnum.java
> Modified:
> lucene/dev/trunk/lucene/CHANGES.txt
>
> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/Terms.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
>
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
>
> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
>
> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java
>
> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java
>
> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java
>
> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
>
> lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
>
> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
>
> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
>
> lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
>
> Modified: lucene/dev/trunk/lucene/CHANGES.txt
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1670918&r1=1670917&r2=1670918&view=diff
> ==============================================================================
> --- lucene/dev/trunk/lucene/CHANGES.txt (original)
> +++ lucene/dev/trunk/lucene/CHANGES.txt Thu Apr 2 15:05:48 2015
> @@ -19,6 +19,10 @@ New Features
> for counting ranges that align with the underlying terms as defined by the
> NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
>
> +* LUCENE-5879: Added experimental auto-prefix terms to BlockTree terms
> + dictionary, exposed as AutoPrefixPostingsFormat (Adrien Grand,
> + Uwe Schindler, Robert Muir, Mike McCandless)
> +
> API Changes
>
> * LUCENE-3312: The API of oal.document was restructured to
>
> Added:
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java
> (added)
> +++
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,125 @@
> +package org.apache.lucene.codecs.autoprefix;
> +
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import java.io.IOException;
> +
> +import org.apache.lucene.codecs.FieldsConsumer;
> +import org.apache.lucene.codecs.FieldsProducer;
> +import org.apache.lucene.codecs.PostingsFormat;
> +import org.apache.lucene.codecs.PostingsReaderBase;
> +import org.apache.lucene.codecs.PostingsWriterBase;
> +import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
> +import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
> +import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
> +import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
> +import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
> +import org.apache.lucene.index.SegmentReadState;
> +import org.apache.lucene.index.SegmentWriteState;
> +import org.apache.lucene.util.IOUtils;
> +
> +/**
> + * Just like {@link Lucene50PostingsFormat} except this format
> + * exposes the experimental auto-prefix terms.
> + *
> + * @lucene.experimental
> + */
> +
> +public final class AutoPrefixPostingsFormat extends PostingsFormat {
> +
> + private final int minItemsInBlock;
> + private final int maxItemsInBlock;
> + private final int minItemsInAutoPrefix;
> + private final int maxItemsInAutoPrefix;
> +
> + /** Creates {@code AutoPrefixPostingsFormat} with default settings. */
> + public AutoPrefixPostingsFormat() {
> + this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
> + BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
> + 25, 48);
> + }
> +
> + /** Creates {@code Lucene50PostingsFormat} with custom
> + * values for {@code minBlockSize} and {@code
> + * maxBlockSize} passed to block terms dictionary.
> + * @see
> BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
> */
> + public AutoPrefixPostingsFormat(int minItemsInAutoPrefix, int
> maxItemsInAutoPrefix) {
> + this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
> + BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
> + minItemsInAutoPrefix,
> + maxItemsInAutoPrefix);
> + }
> +
> + /** Creates {@code Lucene50PostingsFormat} with custom
> + * values for {@code minBlockSize}, {@code
> + * maxBlockSize}, {@code minItemsInAutoPrefix} and {@code
> maxItemsInAutoPrefix}, passed
> + * to block tree terms dictionary.
> + * @see
> BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int)
> */
> + public AutoPrefixPostingsFormat(int minItemsInBlock, int maxItemsInBlock,
> int minItemsInAutoPrefix, int maxItemsInAutoPrefix) {
> + super("AutoPrefix");
> + BlockTreeTermsWriter.validateSettings(minItemsInBlock,
> + maxItemsInBlock);
> + BlockTreeTermsWriter.validateAutoPrefixSettings(minItemsInAutoPrefix,
> + maxItemsInAutoPrefix);
> + this.minItemsInBlock = minItemsInBlock;
> + this.maxItemsInBlock = maxItemsInBlock;
> + this.minItemsInAutoPrefix = minItemsInAutoPrefix;
> + this.maxItemsInAutoPrefix = maxItemsInAutoPrefix;
> + }
> +
> + @Override
> + public String toString() {
> + return getName();
> + }
> +
> + @Override
> + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws
> IOException {
> + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
> +
> + boolean success = false;
> + try {
> + FieldsConsumer ret = new BlockTreeTermsWriter(state,
> + postingsWriter,
> + minItemsInBlock,
> + maxItemsInBlock,
> + minItemsInAutoPrefix,
> + maxItemsInAutoPrefix);
> + success = true;
> + return ret;
> + } finally {
> + if (!success) {
> + IOUtils.closeWhileHandlingException(postingsWriter);
> + }
> + }
> + }
> +
> + @Override
> + public FieldsProducer fieldsProducer(SegmentReadState state) throws
> IOException {
> + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);
> + boolean success = false;
> + try {
> + FieldsProducer ret = new BlockTreeTermsReader(postingsReader, state);
> + success = true;
> + return ret;
> + } finally {
> + if (!success) {
> + IOUtils.closeWhileHandlingException(postingsReader);
> + }
> + }
> + }
> +}
>
> Added:
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java
> (added)
> +++
> lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,22 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +/**
> + * An experimental postings format that automatically indexes appropriate
> + * prefix terms for fast range and prefix queries.
> + */
> +package org.apache.lucene.codecs.autoprefix;
>
> Modified:
> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1670918&r1=1670917&r2=1670918&view=diff
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
> (original)
> +++
> lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
> Thu Apr 2 15:05:48 2015
> @@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.FSTOrdPo
> org.apache.lucene.codecs.memory.FSTPostingsFormat
> org.apache.lucene.codecs.memory.MemoryPostingsFormat
> org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
> +org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat
>
> Added:
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java
> (added)
> +++
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,38 @@
> +package org.apache.lucene.codecs.autoprefix;
> +
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.lucene.index.IndexOptions;
> +import org.apache.lucene.index.RandomPostingsTester;
> +import org.apache.lucene.util.LuceneTestCase;
> +import org.apache.lucene.util.TestUtil;
> +
> +/**
> + * Tests AutoPrefix's postings
> + */
> +
> +// NOTE: we don't extend BasePostingsFormatTestCase because we can only
> handle DOCS_ONLY fields:
> +
> +public class TestAutoPrefixPostingsFormat extends LuceneTestCase {
> + public void test() throws Exception {
> + new
> RandomPostingsTester(random()).testFull(TestUtil.alwaysPostingsFormat(new
> AutoPrefixPostingsFormat()),
> + createTempDir("autoprefix"),
> + IndexOptions.DOCS,
> + false);
> + }
> +}
>
> Added:
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java
> (added)
> +++
> lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,738 @@
> +package org.apache.lucene.codecs.autoprefix;
> +
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import java.util.ArrayList;
> +import java.util.Collections;
> +import java.util.HashMap;
> +import java.util.HashSet;
> +import java.util.List;
> +import java.util.Locale;
> +import java.util.Map;
> +import java.util.Set;
> +
> +import org.apache.lucene.analysis.MockAnalyzer;
> +import org.apache.lucene.analysis.TokenStream;
> +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
> +import org.apache.lucene.codecs.Codec;
> +import org.apache.lucene.document.BinaryDocValuesField;
> +import org.apache.lucene.document.Document;
> +import org.apache.lucene.document.Field;
> +import org.apache.lucene.document.FieldType;
> +import org.apache.lucene.document.NumericDocValuesField;
> +import org.apache.lucene.document.StringField;
> +import org.apache.lucene.index.BinaryDocValues;
> +import org.apache.lucene.index.DirectoryReader;
> +import org.apache.lucene.index.IndexOptions;
> +import org.apache.lucene.index.IndexReader;
> +import org.apache.lucene.index.IndexWriter;
> +import org.apache.lucene.index.IndexWriterConfig;
> +import org.apache.lucene.index.MultiDocValues;
> +import org.apache.lucene.index.MultiFields;
> +import org.apache.lucene.index.NumericDocValues;
> +import org.apache.lucene.index.PostingsEnum;
> +import org.apache.lucene.index.SerialMergeScheduler;
> +import org.apache.lucene.index.Term;
> +import org.apache.lucene.index.Terms;
> +import org.apache.lucene.index.TermsEnum;
> +import org.apache.lucene.search.MultiTermQuery;
> +import org.apache.lucene.search.PrefixQuery;
> +import org.apache.lucene.store.Directory;
> +import org.apache.lucene.util.AttributeImpl;
> +import org.apache.lucene.util.BytesRef;
> +import org.apache.lucene.util.FixedBitSet;
> +import org.apache.lucene.util.LuceneTestCase;
> +import org.apache.lucene.util.StringHelper;
> +import org.apache.lucene.util.TestUtil;
> +import org.apache.lucene.util.automaton.Automata;
> +import org.apache.lucene.util.automaton.CompiledAutomaton;
> +
> +public class TestAutoPrefixTerms extends LuceneTestCase {
> +
> + private int minItemsPerBlock = TestUtil.nextInt(random(), 2, 100);
> + private int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) +
> random().nextInt(100);
> + private int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100);
> + private int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2,
> (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE;
> +
> + private final Codec codec = TestUtil.alwaysPostingsFormat(new
> AutoPrefixPostingsFormat(minItemsPerBlock, maxItemsPerBlock,
> +
> minTermsAutoPrefix, maxTermsAutoPrefix));
> +
> + // Numbers in a restricted range, encoded in decimal, left-0-padded:
> + public void testBasicNumericRanges() throws Exception {
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + IndexWriter w = new IndexWriter(dir, iwc);
> + int numTerms = TestUtil.nextInt(random(), 3000, 50000);
> + Set<String> terms = new HashSet<>();
> + int digits = TestUtil.nextInt(random(), 5, 10);
> + int maxValue = 1;
> + for(int i=0;i<digits;i++) {
> + maxValue *= 10;
> + }
> + String format = "%0" + digits + "d";
> + while (terms.size() < numTerms) {
> + terms.add(String.format(Locale.ROOT, format,
> random().nextInt(maxValue)));
> + }
> +
> + for(String term : terms) {
> + Document doc = new Document();
> + doc.add(new StringField("field", term, Field.Store.NO));
> + doc.add(new NumericDocValuesField("field", Long.parseLong(term)));
> + w.addDocument(doc);
> + }
> +
> + if (VERBOSE) System.out.println("\nTEST: now optimize");
> + if (random().nextBoolean()) {
> + w.forceMerge(1);
> + }
> +
> + if (VERBOSE) System.out.println("\nTEST: now done");
> + IndexReader r = DirectoryReader.open(w, true);
> +
> + List<String> sortedTerms = new ArrayList<>(terms);
> + Collections.sort(sortedTerms);
> +
> + if (VERBOSE) {
> + System.out.println("TEST: sorted terms:");
> + int idx = 0;
> + for(String term : sortedTerms) {
> + System.out.println(idx + ": " + term);
> + idx++;
> + }
> + }
> +
> + int iters = atLeast(100);
> + for(int iter=0;iter<iters;iter++) {
> + int min, max;
> + while (true) {
> + min = random().nextInt(maxValue);
> + max = random().nextInt(maxValue);
> + if (min == max) {
> + continue;
> + } else if (min > max) {
> + int x = min;
> + min = max;
> + max = x;
> + }
> + break;
> + }
> +
> + if (VERBOSE) {
> + System.out.println("\nTEST: iter=" + iter + " min=" + min + " max="
> + max);
> + }
> +
> + boolean minInclusive = random().nextBoolean();
> + boolean maxInclusive = random().nextBoolean();
> + BytesRef minTerm = new BytesRef(String.format(Locale.ROOT, format,
> min));
> + BytesRef maxTerm = new BytesRef(String.format(Locale.ROOT, format,
> max));
> + CompiledAutomaton ca = new
> CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm,
> maxInclusive),
> + true, false,
> Integer.MAX_VALUE, true);
> +
> + TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
> + NumericDocValues docValues = MultiDocValues.getNumericValues(r,
> "field");
> + PostingsEnum postingsEnum = null;
> +
> + VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(),
> minTerm, maxTerm);
> +
> + while (te.next() != null) {
> + if (VERBOSE) {
> + System.out.println(" got term=" + te.term().utf8ToString());
> + }
> + verifier.sawTerm(te.term());
> + postingsEnum = te.postings(null, postingsEnum);
> + int docID;
> + while ((docID = postingsEnum.nextDoc()) !=
> PostingsEnum.NO_MORE_DOCS) {
> + long v = docValues.get(docID);
> + assert v >= min && v <= max: "docID=" + docID + " v=" + v;
> + // The auto-prefix terms should never "overlap" one another, so we
> should only ever see a given docID one time:
> + if (VERBOSE) {
> + System.out.println(" got docID=" + docID + " v=" + v);
> + }
> + verifier.sawDoc(docID);
> + }
> + }
> +
> + int startLoc = Collections.binarySearch(sortedTerms,
> String.format(Locale.ROOT, format, min));
> + if (startLoc < 0) {
> + startLoc = -startLoc-1;
> + } else if (minInclusive == false) {
> + startLoc++;
> + }
> + int endLoc = Collections.binarySearch(sortedTerms,
> String.format(Locale.ROOT, format, max));
> + if (endLoc < 0) {
> + endLoc = -endLoc-2;
> + } else if (maxInclusive == false) {
> + endLoc--;
> + }
> + verifier.finish(endLoc-startLoc+1, maxTermsAutoPrefix);
> + }
> +
> + r.close();
> + w.close();
> + dir.close();
> + }
> +
> + private static BytesRef intToBytes(int v) {
> + int sortableBits = v ^ 0x80000000;
> + BytesRef token = new BytesRef(4);
> + token.length = 4;
> + int index = 3;
> + while (index >= 0) {
> + token.bytes[index] = (byte) (sortableBits & 0xff);
> + index--;
> + sortableBits >>>= 8;
> + }
> + return token;
> + }
> +
> + // Numbers are encoded in full binary (4 byte ints):
> + public void testBinaryNumericRanges() throws Exception {
> + if (VERBOSE) {
> + System.out.println("TEST: minItemsPerBlock=" + minItemsPerBlock);
> + System.out.println("TEST: maxItemsPerBlock=" + maxItemsPerBlock);
> + System.out.println("TEST: minTermsAutoPrefix=" + minTermsAutoPrefix);
> + System.out.println("TEST: maxTermsAutoPrefix=" + maxTermsAutoPrefix);
> + }
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + IndexWriter w = new IndexWriter(dir, iwc);
> + int numTerms = TestUtil.nextInt(random(), 3000, 50000);
> + Set<Integer> terms = new HashSet<>();
> + while (terms.size() < numTerms) {
> + terms.add(random().nextInt());
> + }
> +
> + for(Integer term : terms) {
> + Document doc = new Document();
> + doc.add(new BinaryField("field", intToBytes(term)));
> + doc.add(new NumericDocValuesField("field", term));
> + w.addDocument(doc);
> + }
> +
> + if (random().nextBoolean()) {
> + if (VERBOSE) System.out.println("TEST: now force merge");
> + w.forceMerge(1);
> + }
> +
> + IndexReader r = DirectoryReader.open(w, true);
> +
> + List<Integer> sortedTerms = new ArrayList<>(terms);
> + Collections.sort(sortedTerms);
> +
> + if (VERBOSE) {
> + System.out.println("TEST: sorted terms:");
> + int idx = 0;
> + for(Integer term : sortedTerms) {
> + System.out.println(idx + ": " + term);
> + idx++;
> + }
> + }
> +
> + int iters = atLeast(100);
> + for(int iter=0;iter<iters;iter++) {
> +
> + int min, max;
> + while (true) {
> + min = random().nextInt();
> + max = random().nextInt();
> + if (min == max) {
> + continue;
> + } else if (min > max) {
> + int x = min;
> + min = max;
> + max = x;
> + }
> + break;
> + }
> +
> + if (VERBOSE) {
> + System.out.println("\nTEST: iter=" + iter + " min=" + min + " (" +
> intToBytes(min) + ") max=" + max + " (" + intToBytes(max) + ")");
> + }
> +
> + boolean minInclusive = random().nextBoolean();
> + BytesRef minTerm = intToBytes(min);
> + boolean maxInclusive = random().nextBoolean();
> + BytesRef maxTerm = intToBytes(max);
> + CompiledAutomaton ca = new
> CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm,
> maxInclusive),
> + true, false,
> Integer.MAX_VALUE, true);
> +
> + TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
> + NumericDocValues docValues = MultiDocValues.getNumericValues(r,
> "field");
> + PostingsEnum postingsEnum = null;
> + VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(),
> minTerm, maxTerm);
> + while (te.next() != null) {
> + if (VERBOSE) {
> + System.out.println(" got term=" + te.term() + " docFreq=" +
> te.docFreq());
> + }
> + verifier.sawTerm(te.term());
> + postingsEnum = te.postings(null, postingsEnum);
> + int docID;
> + while ((docID = postingsEnum.nextDoc()) !=
> PostingsEnum.NO_MORE_DOCS) {
> + long v = docValues.get(docID);
> + assert v >= min && v <= max: "docID=" + docID + " v=" + v;
> + verifier.sawDoc(docID);
> + }
> + }
> +
> + int startLoc = Collections.binarySearch(sortedTerms, min);
> + if (startLoc < 0) {
> + startLoc = -startLoc-1;
> + } else if (minInclusive == false) {
> + startLoc++;
> + }
> + int endLoc = Collections.binarySearch(sortedTerms, max);
> + if (endLoc < 0) {
> + endLoc = -endLoc-2;
> + } else if (maxInclusive == false) {
> + endLoc--;
> + }
> + int expectedHits = endLoc-startLoc+1;
> + try {
> + verifier.finish(expectedHits, maxTermsAutoPrefix);
> + } catch (AssertionError ae) {
> + for(int i=0;i<numTerms;i++) {
> + if (verifier.allHits.get(i) == false) {
> + int v = (int) docValues.get(i);
> + boolean accept = (v > min || (v == min && minInclusive)) &&
> + (v < max || (v == max && maxInclusive));
> + if (accept) {
> + System.out.println("MISSING: docID=" + i + " v=" + v + "
> term=" + intToBytes(v));
> + }
> + }
> + }
> +
> + throw ae;
> + }
> + }
> +
> + r.close();
> + w.close();
> + dir.close();
> + }
> +
> + // Non-numeric, simple prefix query
> + public void testBasicPrefixTerms() throws Exception {
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + iwc.setMergeScheduler(new SerialMergeScheduler());
> + IndexWriter w = new IndexWriter(dir, iwc);
> + int numTerms = TestUtil.nextInt(random(), 3000, 50000);
> + Set<String> terms = new HashSet<>();
> + while (terms.size() < numTerms) {
> + terms.add(TestUtil.randomSimpleString(random()));
> + }
> +
> + for(String term : terms) {
> + Document doc = new Document();
> + doc.add(new StringField("field", term, Field.Store.NO));
> + doc.add(new BinaryDocValuesField("field", new BytesRef(term)));
> + w.addDocument(doc);
> + }
> +
> + if (random().nextBoolean()) {
> + if (VERBOSE) {
> + System.out.println("TEST: now force merge");
> + }
> + w.forceMerge(1);
> + }
> +
> + IndexReader r = DirectoryReader.open(w, true);
> +
> + List<String> sortedTerms = new ArrayList<>(terms);
> + Collections.sort(sortedTerms);
> +
> + if (VERBOSE) {
> + System.out.println("TEST: sorted terms:");
> + int idx = 0;
> + for(String term : sortedTerms) {
> + System.out.println(idx + ": " + term);
> + idx++;
> + }
> + }
> +
> + if (VERBOSE) {
> + System.out.println("TEST: r=" + r);
> + }
> +
> + int iters = atLeast(100);
> + for(int iter=0;iter<iters;iter++) {
> + if (VERBOSE) {
> + System.out.println("\nTEST: iter=" + iter);
> + }
> +
> + String prefix;
> + if (random().nextInt(100) == 42) {
> + prefix = "";
> + } else {
> + prefix = TestUtil.randomSimpleString(random(), 1, 4);
> + }
> + BytesRef prefixBR = new BytesRef(prefix);
> + if (VERBOSE) {
> + System.out.println(" prefix=" + prefix);
> + }
> +
> + CompiledAutomaton ca = new
> CompiledAutomaton(PrefixQuery.toAutomaton(prefixBR), true, false,
> Integer.MAX_VALUE, true);
> + TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
> + BinaryDocValues docValues = MultiDocValues.getBinaryValues(r, "field");
> + PostingsEnum postingsEnum = null;
> +
> + VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(),
> prefixBR);
> +
> + while (te.next() != null) {
> + if (VERBOSE) {
> + System.out.println("TEST: got term=" + te.term().utf8ToString() +
> " docFreq=" + te.docFreq());
> + }
> + verifier.sawTerm(te.term());
> + postingsEnum = te.postings(null, postingsEnum);
> + int docID;
> + while ((docID = postingsEnum.nextDoc()) !=
> PostingsEnum.NO_MORE_DOCS) {
> + assertTrue("prefixBR=" + prefixBR + " docBR=" +
> docValues.get(docID), StringHelper.startsWith(docValues.get(docID),
> prefixBR));
> + // The auto-prefix terms should never "overlap" one another, so we
> should only ever see a given docID one time:
> + verifier.sawDoc(docID);
> + }
> + }
> +
> + int startLoc = Collections.binarySearch(sortedTerms, prefix);
> + if (startLoc < 0) {
> + startLoc = -startLoc-1;
> + }
> + int endLoc = Collections.binarySearch(sortedTerms, prefix + (char)
> ('z'+1));
> + if (endLoc < 0) {
> + endLoc = -endLoc-2;
> + }
> + int expectedHits = endLoc-startLoc+1;
> + try {
> + verifier.finish(expectedHits, maxTermsAutoPrefix);
> + } catch (AssertionError ae) {
> + for(int i=0;i<numTerms;i++) {
> + if (verifier.allHits.get(i) == false) {
> + String s = docValues.get(i).utf8ToString();
> + if (s.startsWith(prefix)) {
> + System.out.println("MISSING: docID=" + i + " term=" + s);
> + }
> + }
> + }
> +
> + throw ae;
> + }
> + }
> +
> + r.close();
> + w.close();
> + dir.close();
> + }
> +
> + public void testDemoPrefixTerms() throws Exception {
> + if (VERBOSE) {
> + System.out.println("\nTEST: minTermsAutoPrefix=" + minTermsAutoPrefix
> + " maxTermsAutoPrefix=" + maxTermsAutoPrefix);
> + System.out.println("\nTEST: minItemsPerBlock=" + minItemsPerBlock + "
> maxItemsPerBlock=" + maxItemsPerBlock);
> + }
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + IndexWriter w = new IndexWriter(dir, iwc);
> + int numDocs = 30;
> +
> + for(int i=0;i<numDocs;i++) {
> + Document doc = new Document();
> + doc.add(new StringField("field", "" + (char) (97+i), Field.Store.NO));
> + w.addDocument(doc);
> + doc = new Document();
> + doc.add(new StringField("field", "a" + (char) (97+i), Field.Store.NO));
> + w.addDocument(doc);
> + }
> +
> + if (random().nextBoolean()) {
> + w.forceMerge(1);
> + }
> +
> + IndexReader r = DirectoryReader.open(w, true);
> + Terms terms = MultiFields.getTerms(r, "field");
> + if (VERBOSE) {
> + System.out.println("\nTEST: now intersect");
> + }
> + CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(new
> BytesRef("a")), false, false, Integer.MAX_VALUE, true);
> + TermsEnum te = ca.getTermsEnum(terms);
> + PostingsEnum postingsEnum = null;
> +
> + VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(),
> new BytesRef("a"));
> + //TermsEnum te = terms.intersect(new CompiledAutomaton(a, true, false),
> null);
> + while (te.next() != null) {
> + verifier.sawTerm(te.term());
> + postingsEnum = te.postings(null, postingsEnum);
> + int docID;
> + while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
> + // The auto-prefix terms should never "overlap" one another, so we
> should only ever see a given docID one time:
> + verifier.sawDoc(docID);
> + }
> + }
> + // 1 document has exactly "a", and 30 documents had "a?"
> + verifier.finish(31, maxTermsAutoPrefix);
> + PrefixQuery q = new PrefixQuery(new Term("field", "a"));
> + q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
> + assertEquals(31, newSearcher(r).search(q, 1).totalHits);
> + r.close();
> + w.close();
> + dir.close();
> + }
> +
> + static final class BinaryTokenStream extends TokenStream {
> + private final ByteTermAttribute bytesAtt =
> addAttribute(ByteTermAttribute.class);
> + private boolean available = true;
> +
> + public BinaryTokenStream(BytesRef bytes) {
> + bytesAtt.setBytesRef(bytes);
> + }
> +
> + @Override
> + public boolean incrementToken() {
> + if (available) {
> + clearAttributes();
> + available = false;
> + return true;
> + }
> + return false;
> + }
> +
> + @Override
> + public void reset() {
> + available = true;
> + }
> +
> + public interface ByteTermAttribute extends TermToBytesRefAttribute {
> + void setBytesRef(BytesRef bytes);
> + }
> +
> + public static class ByteTermAttributeImpl extends AttributeImpl
> implements ByteTermAttribute,TermToBytesRefAttribute {
> + private BytesRef bytes;
> +
> + @Override
> + public void fillBytesRef() {
> + // no-op: the bytes was already filled by our owner's incrementToken
> + }
> +
> + @Override
> + public BytesRef getBytesRef() {
> + return bytes;
> + }
> +
> + @Override
> + public void setBytesRef(BytesRef bytes) {
> + this.bytes = bytes;
> + }
> +
> + @Override
> + public void clear() {}
> +
> + @Override
> + public void copyTo(AttributeImpl target) {
> + ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
> + other.bytes = bytes;
> + }
> + }
> + }
> +
> + /** Basically a StringField that accepts binary term. */
> + private static class BinaryField extends Field {
> +
> + final static FieldType TYPE;
> + static {
> + TYPE = new FieldType(StringField.TYPE_NOT_STORED);
> + // Necessary so our custom tokenStream is used by Field.tokenStream:
> + TYPE.setTokenized(true);
> + TYPE.freeze();
> + }
> +
> + public BinaryField(String name, BytesRef value) {
> + super(name, new BinaryTokenStream(value), TYPE);
> + }
> + }
> +
> + /** Helper class to ensure auto-prefix terms 1) never overlap one another,
> and 2) are used when they should be. */
> + private static class VerifyAutoPrefixTerms {
> + final FixedBitSet allHits;
> + private final Map<BytesRef,Integer> prefixCounts = new HashMap<>();
> + private int totPrefixCount;
> + private final BytesRef[] bounds;
> + private int totTermCount;
> + private BytesRef lastTerm;
> +
> + public VerifyAutoPrefixTerms(int maxDoc, BytesRef... bounds) {
> + allHits = new FixedBitSet(maxDoc);
> + assert bounds.length > 0;
> + this.bounds = bounds;
> + }
> +
> + public void sawTerm(BytesRef term) {
> + //System.out.println("saw term=" + term);
> + if (lastTerm != null) {
> + assertTrue(lastTerm.compareTo(term) < 0);
> + }
> + lastTerm = BytesRef.deepCopyOf(term);
> + totTermCount++;
> + totPrefixCount += term.length;
> + for(int i=1;i<=term.length;i++) {
> + BytesRef prefix = BytesRef.deepCopyOf(term);
> + prefix.length = i;
> + Integer count = prefixCounts.get(prefix);
> + if (count == null) {
> + count = 1;
> + } else {
> + count += 1;
> + }
> + prefixCounts.put(prefix, count);
> + }
> + }
> +
> + public void sawDoc(int docID) {
> + // The auto-prefix terms should never "overlap" one another, so we
> should only ever see a given docID one time:
> + assertFalse(allHits.getAndSet(docID));
> + }
> +
> + public void finish(int expectedNumHits, int maxPrefixCount) {
> +
> + if (maxPrefixCount != -1) {
> + // Auto-terms were used in this test
> + long allowedMaxTerms;
> +
> + if (bounds.length == 1) {
> + // Simple prefix query: we should never see more than
> maxPrefixCount terms:
> + allowedMaxTerms = maxPrefixCount;
> + } else {
> + // Trickier: we need to allow for maxPrefixTerms for each
> different leading byte in the min and max:
> + assert bounds.length == 2;
> + BytesRef minTerm = bounds[0];
> + BytesRef maxTerm = bounds[1];
> +
> + int commonPrefix = 0;
> + for(int i=0;i<minTerm.length && i<maxTerm.length;i++) {
> + if (minTerm.bytes[minTerm.offset+i] !=
> maxTerm.bytes[maxTerm.offset+i]) {
> + commonPrefix = i;
> + break;
> + }
> + }
> +
> + allowedMaxTerms = maxPrefixCount * (long)
> ((minTerm.length-commonPrefix) + (maxTerm.length-commonPrefix));
> + }
> +
> + assertTrue("totTermCount=" + totTermCount + " is > allowedMaxTerms="
> + allowedMaxTerms, totTermCount <= allowedMaxTerms);
> + }
> +
> + assertEquals(expectedNumHits, allHits.cardinality());
> + int sum = 0;
> + for(Map.Entry<BytesRef,Integer> ent : prefixCounts.entrySet()) {
> +
> + BytesRef prefix = ent.getKey();
> + if (VERBOSE) {
> + System.out.println(" verify prefix=" +
> TestUtil.bytesRefToString(prefix) + " count=" + ent.getValue());
> + }
> +
> + if (maxPrefixCount != -1) {
> + // Auto-terms were used in this test
> +
> + int sumLeftoverSuffix = 0;
> + for(BytesRef bound : bounds) {
> +
> + int minSharedLength = Math.min(bound.length, prefix.length);
> + int commonPrefix = minSharedLength;
> + for(int i=0;i<minSharedLength;i++) {
> + if (bound.bytes[bound.offset+i] !=
> prefix.bytes[prefix.offset+i]) {
> + commonPrefix = i;
> + break;
> + }
> + }
> + sumLeftoverSuffix += bound.length - commonPrefix;
> + }
> +
> + long limit = (1+sumLeftoverSuffix) * (long) maxPrefixCount;
> +
> + assertTrue("maxPrefixCount=" + maxPrefixCount + " prefix=" +
> prefix + " sumLeftoverSuffix=" + sumLeftoverSuffix + " limit=" + limit + " vs
> actual=" +ent.getValue(),
> + ent.getValue() <= limit);
> + }
> +
> + sum += ent.getValue();
> + }
> +
> + // Make sure no test bug:
> + assertEquals(totPrefixCount, sum);
> + }
> + }
> +
> + /** Make sure you get clear exc. if you try to use this within anything
> but IndexOptions.DOCS fields. */
> + public void testWithFreqs() throws Exception {
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + IndexWriter w = new IndexWriter(dir, iwc);
> + FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
> + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
> + Document doc = new Document();
> + doc.add(new Field("foo", "bar bar", ft));
> + w.addDocument(doc);
> + try {
> + w.commit();
> + } catch (IllegalStateException ise) {
> + assertEquals("ranges can only be indexed with IndexOptions.DOCS
> (field: foo)", ise.getMessage());
> + }
> + w.close();
> + dir.close();
> + }
> +
> + /** Make sure you get clear exc. if you try to use this within anything
> but IndexOptions.DOCS fields. */
> + public void testWithPositions() throws Exception {
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + IndexWriter w = new IndexWriter(dir, iwc);
> + FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
> + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
> + Document doc = new Document();
> + doc.add(new Field("foo", "bar bar", ft));
> + w.addDocument(doc);
> + try {
> + w.commit();
> + } catch (IllegalStateException ise) {
> + assertEquals("ranges can only be indexed with IndexOptions.DOCS
> (field: foo)", ise.getMessage());
> + }
> + w.close();
> + dir.close();
> + }
> +
> + /** Make sure you get clear exc. if you try to use this within anything
> but IndexOptions.DOCS fields. */
> + public void testWithOffsets() throws Exception {
> + Directory dir = newDirectory();
> + IndexWriterConfig iwc = new IndexWriterConfig(new
> MockAnalyzer(random()));
> + iwc.setCodec(codec);
> + IndexWriter w = new IndexWriter(dir, iwc);
> + FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
> +
> ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
> + Document doc = new Document();
> + doc.add(new Field("foo", "bar bar", ft));
> + w.addDocument(doc);
> + try {
> + w.commit();
> + } catch (IllegalStateException ise) {
> + assertEquals("ranges can only be indexed with IndexOptions.DOCS
> (field: foo)", ise.getMessage());
> + }
> + w.close();
> + dir.close();
> + }
> +}
>
> Modified:
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java?rev=1670918&r1=1670917&r2=1670918&view=diff
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
> (original)
> +++
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
> Thu Apr 2 15:05:48 2015
> @@ -16,6 +16,7 @@ package org.apache.lucene.codecs;
> * limitations under the License.
> */
>
> +import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; // javadocs
> import org.apache.lucene.index.OrdTermState;
> import org.apache.lucene.index.TermState;
>
> @@ -23,6 +24,8 @@ import org.apache.lucene.index.TermState
> * Holds all state required for {@link PostingsReaderBase}
> * to produce a {@link org.apache.lucene.index.PostingsEnum} without
> re-seeking the
> * terms dict.
> + *
> + * @lucene.internal
> */
> public class BlockTermState extends OrdTermState {
> /** how many docs have this term */
> @@ -36,6 +39,11 @@ public class BlockTermState extends OrdT
> // TODO: update BTR to nuke this
> public long blockFilePointer;
>
> + /** True if this term is "real" (e.g., not an auto-prefix term or
> + * some other "secret" term; currently only {@link BlockTreeTermsReader}
> + * sets this). */
> + public boolean isRealTerm;
> +
> /** Sole constructor. (For invocation by subclass
> * constructors, typically implicit.) */
> protected BlockTermState() {
> @@ -50,10 +58,11 @@ public class BlockTermState extends OrdT
> totalTermFreq = other.totalTermFreq;
> termBlockOrd = other.termBlockOrd;
> blockFilePointer = other.blockFilePointer;
> + isRealTerm = other.isRealTerm;
> }
>
> @Override
> public String toString() {
> - return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + "
> termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer;
> + return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + "
> termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer + "
> isRealTerm=" + isRealTerm;
> }
> }
>
> Modified:
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java?rev=1670918&r1=1670917&r2=1670918&view=diff
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
> (original)
> +++
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
> Thu Apr 2 15:05:48 2015
> @@ -62,6 +62,7 @@ public abstract class PostingsFormat imp
> * @param name must be all ascii alphanumeric, and less than 128
> characters in length.
> */
> protected PostingsFormat(String name) {
> + // TODO: can we somehow detect name conflicts here? Two different
> classes trying to claim the same name? Otherwise you see confusing errors...
> NamedSPILoader.checkServiceName(name);
> this.name = name;
> }
>
> Added:
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
> (added)
> +++
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,415 @@
> +package org.apache.lucene.codecs.blocktree;
> +
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import java.io.IOException;
> +import java.util.ArrayList;
> +import java.util.Collections;
> +import java.util.List;
> +
> +import org.apache.lucene.index.FilteredTermsEnum;
> +import org.apache.lucene.index.Terms;
> +import org.apache.lucene.index.TermsEnum;
> +import org.apache.lucene.util.ArrayUtil;
> +import org.apache.lucene.util.BytesRef;
> +import org.apache.lucene.util.BytesRefBuilder;
> +import org.apache.lucene.util.StringHelper;
> +
> +// TODO: instead of inlining auto-prefix terms with normal terms,
> +// we could write them into their own virtual/private field. This
> +// would make search time a bit more complex, since we'd need to
> +// merge sort between two TermEnums, but it would also make stats
> +// API (used by CheckIndex -verbose) easier to implement since we could
> +// just walk this virtual field and gather its stats)
> +
> +/** Used in the first pass when writing a segment to locate
> + * "appropriate" auto-prefix terms to pre-compile into the index.
> + * This visits every term in the index to find prefixes that
> + * match >= min and <= max number of terms. */
> +
> +class AutoPrefixTermsWriter {
> +
> + //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
> + //static boolean DEBUG = false;
> + //static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2;
> + //static boolean DEBUG2 = true;
> +
> + /** Describes a range of term-space to match, either a simple prefix
> + * (foo*) or a floor-block range of a prefix (e.g. foo[a-m]*,
> + * foo[n-z]*) when there are too many terms starting with foo*. */
> + public static final class PrefixTerm implements Comparable<PrefixTerm> {
> + /** Common prefix */
> + public final byte[] prefix;
> +
> + /** If this is -2, this is a normal prefix (foo*), else it's the
> minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). */
> + public final int floorLeadStart;
> +
> + /** The lead byte (inclusive) of the suffix for the term range we match
> (e.g. 'm' in foo[d-m]*); this is ignored when
> + * floorLeadStart is -2. */
> + public final int floorLeadEnd;
> +
> + public final BytesRef term;
> +
> + /** Sole constructor. */
> + public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) {
> + this.prefix = prefix;
> + this.floorLeadStart = floorLeadStart;
> + this.floorLeadEnd = floorLeadEnd;
> + this.term = toBytesRef(prefix, floorLeadStart);
> +
> + assert floorLeadEnd >= floorLeadStart;
> + assert floorLeadEnd >= 0;
> + assert floorLeadStart == -2 || floorLeadStart >= 0;
> +
> + // We should never create empty-string prefix term:
> + assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd !=
> 0xff;
> + }
> +
> + @Override
> + public String toString() {
> + String s = brToString(new BytesRef(prefix));
> + if (floorLeadStart == -2) {
> + s += "[-" + Integer.toHexString(floorLeadEnd) + "]";
> + } else {
> + s += "[" + Integer.toHexString(floorLeadStart) + "-" +
> Integer.toHexString(floorLeadEnd) + "]";
> + }
> + return s;
> + }
> +
> + @Override
> + public int compareTo(PrefixTerm other) {
> + int cmp = term.compareTo(other.term);
> + if (cmp == 0) {
> + if (prefix.length != other.prefix.length) {
> + return prefix.length - other.prefix.length;
> + }
> +
> + // On tie, sort the bigger floorLeadEnd, earlier, since it
> + // spans more terms, so during intersect, we want to encounter this
> one
> + // first so we can use it if the automaton accepts the larger range:
> + cmp = other.floorLeadEnd - floorLeadEnd;
> + }
> +
> + return cmp;
> + }
> +
> + /** Returns the leading term for this prefix term, e.g. "foo" (for
> + * the foo* prefix) or "foom" (for the foo[m-z]* case). */
> + private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) {
> + BytesRef br;
> + if (floorLeadStart != -2) {
> + assert floorLeadStart >= 0;
> + br = new BytesRef(prefix.length+1);
> + } else {
> + br = new BytesRef(prefix.length);
> + }
> + System.arraycopy(prefix, 0, br.bytes, 0, prefix.length);
> + br.length = prefix.length;
> + if (floorLeadStart != -2) {
> + assert floorLeadStart >= 0;
> + br.bytes[br.length++] = (byte) floorLeadStart;
> + }
> +
> + return br;
> + }
> +
> + public int compareTo(BytesRef term) {
> + return this.term.compareTo(term);
> + }
> +
> + public TermsEnum getTermsEnum(TermsEnum in) {
> +
> + final BytesRef prefixRef = new BytesRef(prefix);
> +
> + return new FilteredTermsEnum(in) {
> + {
> + setInitialSeekTerm(term);
> + }
> +
> + @Override
> + protected AcceptStatus accept(BytesRef term) {
> + if (StringHelper.startsWith(term, prefixRef) &&
> + (floorLeadEnd == -1 || term.length == prefixRef.length ||
> (term.bytes[term.offset + prefixRef.length] & 0xff) <= floorLeadEnd)) {
> + return AcceptStatus.YES;
> + } else {
> + return AcceptStatus.END;
> + }
> + }
> + };
> + }
> + }
> +
> + // for debugging
> + static String brToString(BytesRef b) {
> + try {
> + return b.utf8ToString() + " " + b;
> + } catch (Throwable t) {
> + // If BytesRef isn't actually UTF8, or it's eg a
> + // prefix of UTF8 that ends mid-unicode-char, we
> + // fallback to hex:
> + return b.toString();
> + }
> + }
> +
> + final List<PrefixTerm> prefixes = new ArrayList<>();
> + private final int minItemsInPrefix;
> + private final int maxItemsInPrefix;
> +
> + // Records index into pending where the current prefix at that
> + // length "started"; for example, if current term starts with 't',
> + // prefixStarts[0] is the index into pending for the first
> + // term/sub-block starting with 't'. We use this to figure out when
> + // to write a new block:
> + private final BytesRefBuilder lastTerm = new BytesRefBuilder();
> + private int[] prefixStarts = new int[8];
> + private List<Object> pending = new ArrayList<>();
> +
> + //private final String segment;
> +
> + public AutoPrefixTermsWriter(Terms terms, int minItemsInPrefix, int
> maxItemsInPrefix) throws IOException {
> + this.minItemsInPrefix = minItemsInPrefix;
> + this.maxItemsInPrefix = maxItemsInPrefix;
> + //this.segment = segment;
> +
> + TermsEnum termsEnum = terms.iterator(null);
> + while (true) {
> + BytesRef term = termsEnum.next();
> + if (term == null) {
> + break;
> + }
> + //if (DEBUG) System.out.println("pushTerm: " + brToString(term));
> + pushTerm(term);
> + }
> +
> + if (pending.size() > 1) {
> + pushTerm(BlockTreeTermsWriter.EMPTY_BYTES_REF);
> +
> + // Also maybe save floor prefixes in root block; this can be a biggish
> perf gain for large ranges:
> + /*
> + System.out.println("root block pending.size=" + pending.size());
> + for(Object o : pending) {
> + System.out.println(" " + o);
> + }
> + */
> + while (pending.size() >= minItemsInPrefix) {
> + savePrefixes(0, pending.size());
> + }
> + }
> +
> + Collections.sort(prefixes);
> + }
> +
> + /** Pushes the new term to the top of the stack, and writes new blocks. */
> + private void pushTerm(BytesRef text) throws IOException {
> + int limit = Math.min(lastTerm.length(), text.length);
> +
> + // Find common prefix between last term and current term:
> + int pos = 0;
> + while (pos < limit && lastTerm.byteAt(pos) ==
> text.bytes[text.offset+pos]) {
> + pos++;
> + }
> +
> + //if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length="
> + lastTerm.length());
> +
> + // Close the "abandoned" suffix now:
> + for(int i=lastTerm.length()-1;i>=pos;i--) {
> +
> + // How many items on top of the stack share the current suffix
> + // we are closing:
> + int prefixTopSize = pending.size() - prefixStarts[i];
> +
> + while (prefixTopSize >= minItemsInPrefix) {
> + //if (DEBUG) System.out.println("pushTerm i=" + i + "
> prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
> + savePrefixes(i+1, prefixTopSize);
> + //prefixStarts[i] -= prefixTopSize;
> + //System.out.println(" after savePrefixes: " + (pending.size() -
> prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" +
> prefixStarts[i]);
> +
> + // For large floor blocks, it's possible we should now re-run on the
> new prefix terms we just created:
> + prefixTopSize = pending.size() - prefixStarts[i];
> + }
> + }
> +
> + if (prefixStarts.length < text.length) {
> + prefixStarts = ArrayUtil.grow(prefixStarts, text.length);
> + }
> +
> + // Init new tail:
> + for(int i=pos;i<text.length;i++) {
> + prefixStarts[i] = pending.size();
> + }
> +
> + lastTerm.copyBytes(text);
> +
> + // Only append the first (optional) empty string, not the fake last one
> used to close all prefixes:
> + if (text.length > 0 || pending.isEmpty()) {
> + byte[] termBytes = new byte[text.length];
> + System.arraycopy(text.bytes, text.offset, termBytes, 0, text.length);
> + pending.add(termBytes);
> + }
> + }
> +
> + void savePrefixes(int prefixLength, int count) throws IOException {
> +
> + assert count > 0;
> +
> + //if (DEBUG2) {
> + // BytesRef br = new BytesRef(lastTerm.bytes());
> + // br.length = prefixLength;
> + // System.out.println(" savePrefixes: seg=" + segment + " " +
> brToString(br) + " count=" + count + " pending.size()=" + pending.size());
> + //}
> +
> + int lastSuffixLeadLabel = -2;
> +
> + int start = pending.size()-count;
> + assert start >=0;
> +
> + int end = pending.size();
> + int nextBlockStart = start;
> + int nextFloorLeadLabel = -1;
> + int prefixCount = 0;
> + int pendingCount = 0;
> + PrefixTerm lastPTEntry = null;
> + for (int i=start; i<end; i++) {
> +
> + byte[] termBytes;
> + Object o = pending.get(i);
> + PrefixTerm ptEntry;
> + if (o instanceof byte[]) {
> + ptEntry = null;
> + termBytes = (byte[]) o;
> + } else {
> + ptEntry = (PrefixTerm) o;
> + termBytes = ptEntry.term.bytes;
> + if (ptEntry.prefix.length != prefixLength) {
> + assert ptEntry.prefix.length > prefixLength;
> + ptEntry = null;
> + }
> + }
> + pendingCount++;
> +
> + //if (DEBUG) System.out.println(" check term=" + brToString(new
> BytesRef(termBytes)));
> +
> + int suffixLeadLabel;
> +
> + if (termBytes.length == prefixLength) {
> + // Suffix is 0, i.e. prefix 'foo' and term is
> + // 'foo' so the term has empty string suffix
> + // in this block
> + assert lastSuffixLeadLabel == -2;
> + suffixLeadLabel = -2;
> + } else {
> + suffixLeadLabel = termBytes[prefixLength] & 0xff;
> + }
> +
> + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + "
> suffixLeadLabel=" + suffixLeadLabel);
> +
> + if (suffixLeadLabel != lastSuffixLeadLabel) {
> + // This is a boundary, a chance to make an auto-prefix term if we
> want:
> +
> + // When we are "recursing" (generating auto-prefix terms on a block
> of
> + // floor'd auto-prefix terms), this assert is non-trivial because it
> + // ensures the floorLeadEnd of the previous terms is in fact less
> + // than the lead start of the current entry:
> + assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" +
> suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
> +
> + // NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2
> and prefix is 'a' and we've seen 'a' and then 'aa'
> + if (pendingCount >= minItemsInPrefix && end-nextBlockStart >
> maxItemsInPrefix && nextFloorLeadLabel != -1) {
> + // The count is too large for one block, so we must break it into
> "floor" blocks, where we record
> + // the leading label of the suffix of the first term in each floor
> block, so at search time we can
> + // jump to the right floor block. We just use a naive greedy
> segmenter here: make a new floor
> block as soon as we have at least minItemsInPrefix. This is not
> always best: it often produces
> + // a too-small block as the final block:
> +
> + // If the last entry was another prefix term of the same length,
> then it represents a range of terms, so we must use its ending
> + // prefix label as our ending label:
> + if (lastPTEntry != null) {
> + lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
> + }
> +
> + savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
> + pendingCount = 0;
> +
> + prefixCount++;
> + nextFloorLeadLabel = suffixLeadLabel;
> + nextBlockStart = i;
> + }
> +
> + if (nextFloorLeadLabel == -1) {
> + nextFloorLeadLabel = suffixLeadLabel;
> + //if (DEBUG) System.out.println("set first lead label=" +
> nextFloorLeadLabel);
> + }
> +
> + lastSuffixLeadLabel = suffixLeadLabel;
> + }
> + lastPTEntry = ptEntry;
> + }
> +
> + // Write last block, if any:
> + if (nextBlockStart < end) {
> + //System.out.println(" lastPTEntry=" + lastPTEntry + "
> lastSuffixLeadLabel=" + lastSuffixLeadLabel);
> + if (lastPTEntry != null) {
> + lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
> + }
> + assert lastSuffixLeadLabel >= nextFloorLeadLabel:
> "lastSuffixLeadLabel=" + lastSuffixLeadLabel + " nextFloorLeadLabel=" +
> nextFloorLeadLabel;
> + if (prefixCount == 0) {
> + if (prefixLength > 0) {
> + savePrefix(prefixLength, -2, 0xff);
> + prefixCount++;
> + } else {
> + // Don't add a prefix term for all terms in the index!
> + }
> + } else {
> + if (lastSuffixLeadLabel == -2) {
> + // Special case when closing the empty string root block:
> + lastSuffixLeadLabel = 0xff;
> + }
> + savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
> + prefixCount++;
> + }
> + }
> +
> + // Remove slice from the top of the pending stack, that we just wrote:
> + int sizeToClear = count;
> + if (prefixCount > 1) {
> + Object o = pending.get(pending.size()-count);
> + if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
> + // If we were just asked to write all f* terms, but there were too
> many and so we made floor blocks, the exact term 'f' will remain
> + // as its own item, followed by floor block terms like f[a-m]*,
> f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
> + sizeToClear--;
> + }
> + }
> + pending.subList(pending.size()-sizeToClear, pending.size()).clear();
> +
> + // Append prefix terms for each prefix, since these count like real
> terms that also need to be "rolled up":
> + for(int i=0;i<prefixCount;i++) {
> + PrefixTerm pt = prefixes.get(prefixes.size()-(prefixCount-i));
> + pending.add(pt);
> + }
> + }
> +
> + private void savePrefix(int prefixLength, int floorLeadStart, int
> floorLeadEnd) {
> + byte[] prefix = new byte[prefixLength];
> + System.arraycopy(lastTerm.bytes(), 0, prefix, 0, prefixLength);
> + assert floorLeadStart != -1;
> + assert floorLeadEnd != -1;
> +
> + PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd);
> + //if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " "
> + pt + " count=" + count);
> + prefixes.add(pt);
> + }
> +}
>
> Added:
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java
> (added)
> +++
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetPostingsEnum.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,95 @@
> +package org.apache.lucene.codecs.blocktree;
> +
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import java.io.IOException;
> +
> +import org.apache.lucene.index.PostingsEnum;
> +import org.apache.lucene.search.DocIdSetIterator;
> +import org.apache.lucene.util.BitSet;
> +import org.apache.lucene.util.BitSetIterator;
> +import org.apache.lucene.util.BytesRef;
> +import org.apache.lucene.util.FixedBitSet; // javadocs
> +
> +/** Takes a {@link FixedBitSet} and creates a DOCS {@link PostingsEnum} from it. */
> +
> +class BitSetPostingsEnum extends PostingsEnum {
> + private final BitSet bits;
> + private DocIdSetIterator in;
> +
> + BitSetPostingsEnum(BitSet bits) {
> + this.bits = bits;
> + reset();
> + }
> +
> + @Override
> + public int freq() throws IOException {
> + return 1;
> + }
> +
> + @Override
> + public int docID() {
> + if (in == null) {
> + return -1;
> + } else {
> + return in.docID();
> + }
> + }
> +
> + @Override
> + public int nextDoc() throws IOException {
> + if (in == null) {
> + in = new BitSetIterator(bits, 0);
> + }
> + return in.nextDoc();
> + }
> +
> + @Override
> + public int advance(int target) throws IOException {
> + return in.advance(target);
> + }
> +
> + @Override
> + public long cost() {
> + return in.cost();
> + }
> +
> + void reset() {
> + in = null;
> + }
> +
> + @Override
> + public BytesRef getPayload() {
> + return null;
> + }
> +
> + @Override
> + public int nextPosition() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public int startOffset() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public int endOffset() {
> + throw new UnsupportedOperationException();
> + }
> +}
>
> Added:
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java?rev=1670918&view=auto
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java
> (added)
> +++
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BitSetTermsEnum.java
> Thu Apr 2 15:05:48 2015
> @@ -0,0 +1,87 @@
> +package org.apache.lucene.codecs.blocktree;
> +
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.lucene.codecs.PostingsWriterBase;
> +import org.apache.lucene.index.PostingsEnum;
> +import org.apache.lucene.index.TermsEnum;
> +import org.apache.lucene.util.BitSet;
> +import org.apache.lucene.util.Bits;
> +import org.apache.lucene.util.BytesRef;
> +
> +/** Silly stub class, used only when writing an auto-prefix
> + * term in order to expose DocsEnum over a FixedBitSet. We
> + * pass this to {@link PostingsWriterBase#writeTerm} so
> + * that it can pull .docs() multiple times for the
> + * current term. */
> +
> +class BitSetTermsEnum extends TermsEnum {
> + private final BitSetPostingsEnum postingsEnum;
> +
> + public BitSetTermsEnum(BitSet docs) {
> + postingsEnum = new BitSetPostingsEnum(docs);
> + }
> +
> + @Override
> + public SeekStatus seekCeil(BytesRef text) {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public void seekExact(long ord) {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public BytesRef term() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public BytesRef next() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public long ord() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public int docFreq() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public long totalTermFreq() {
> + throw new UnsupportedOperationException();
> + }
> +
> + @Override
> + public PostingsEnum postings(Bits liveDocs, PostingsEnum reuse, int flags) {
> + if (flags != PostingsEnum.NONE) {
> + // We only work with DOCS_ONLY fields
> + return null;
> + }
> + if (liveDocs != null) {
> + throw new IllegalArgumentException("cannot handle live docs");
> + }
> + postingsEnum.reset();
> + return postingsEnum;
> + }
> +}
>
> Modified:
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java?rev=1670918&r1=1670917&r2=1670918&view=diff
> ==============================================================================
> ---
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
> (original)
> +++
> lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
> Thu Apr 2 15:05:48 2015
> @@ -34,6 +34,8 @@ import org.apache.lucene.index.IndexFile
> import org.apache.lucene.index.IndexOptions;
> import org.apache.lucene.index.SegmentReadState;
> import org.apache.lucene.index.Terms;
> +import org.apache.lucene.search.PrefixQuery; // javadocs
> +import org.apache.lucene.search.TermRangeQuery; // javadocs
> import org.apache.lucene.store.IndexInput;
> import org.apache.lucene.util.Accountable;
> import org.apache.lucene.util.Accountables;
> @@ -57,6 +59,14 @@ import org.apache.lucene.util.fst.Output
> * min/maxItemsPerBlock during indexing to control how
> * much memory the terms index uses.</p>
> *
> + * <p>If auto-prefix terms were indexed (see
> + * {@link BlockTreeTermsWriter}), then the {@link Terms#intersect}
> + * implementation here will make use of these terms only if the
> + * automaton has a binary sink state, i.e. an accept state
> + * which has a transition to itself accepting all byte values.
> + * For example, both {@link PrefixQuery} and {@link TermRangeQuery}
> + * pass such automata to {@link Terms#intersect}.</p>
> + *
> * <p>The data structure used by this implementation is very
> * similar to a burst trie
> * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
> @@ -90,8 +100,11 @@ public final class BlockTreeTermsReader
> /** Initial terms format. */
> public static final int VERSION_START = 0;
>
> + /** Auto-prefix terms. */
> + public static final int VERSION_AUTO_PREFIX_TERMS = 1;
> +
> /** Current terms format. */
> - public static final int VERSION_CURRENT = VERSION_START;
> + public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS;
>
> /** Extension of terms index file */
> static final String TERMS_INDEX_EXTENSION = "tip";
> @@ -116,7 +129,7 @@ public final class BlockTreeTermsReader
>
> final String segment;
>
> - private final int version;
> + final int version;
>
> /** Sole constructor. */
> public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
>
>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]