http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/991fb043/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java ---------------------------------------------------------------------- diff --git a/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java new file mode 100644 index 0000000..4ef65ff --- /dev/null +++ b/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java @@ -0,0 +1,2939 @@ +package org.apache.lucene.codecs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Locale; +import java.util.TreeMap; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.Util; + +/** A block-based terms index and dictionary that assigns + * terms to variable length blocks according to how they + * share prefixes. The terms index is a prefix trie + * whose leaves are term blocks. The advantage of this + * approach is that seekExact is often able to + * determine a term cannot exist without doing any IO, and + * intersection with Automata is very fast. Note that this + * terms dictionary has it's own fixed terms index (ie, it + * does not support a pluggable terms index + * implementation). + * + * <p><b>NOTE</b>: this terms dictionary does not support + * index divisor when opening an IndexReader. Instead, you + * can change the min/maxItemsPerBlock during indexing.</p> + * + * <p>The data structure used by this implementation is very + * similar to a burst trie + * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499), + * but with added logic to break up too-large blocks of all + * terms sharing a given prefix into smaller ones.</p> + * + * <p>Use {@link org.apache.lucene.index.CheckIndex} with the <code>-verbose</code> + * option to see summary statistics on the blocks in the + * dictionary. + * + * See {@link BlockTreeTermsWriter}. + * + * @lucene.experimental + */ + +public class BlockTreeTermsReader extends FieldsProducer { + + // Open input to the main terms dict file (_X.tib) + private final IndexInput in; + + //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + private final PostingsReaderBase postingsReader; + + private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>(); + + /** File offset where the directory starts in the terms file. */ + private long dirOffset; + + /** File offset where the directory starts in the index file. */ + private long indexDirOffset; + + private String segment; + + private final int version; + + private final IndexInput indexIn; + + /** Sole constructor. */ + public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, + PostingsReaderBase postingsReader, IOContext ioContext, + String segmentSuffix, int indexDivisor) + throws IOException { + + this.postingsReader = postingsReader; + + this.segment = info.name; + in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION), + ioContext); + + boolean success = false; + + try { + version = readHeader(in); + if (indexDivisor != -1) { + indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), + ioContext); + int indexVersion = readIndexHeader(indexIn); + if (indexVersion != version) { + throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion); + } + } else { + indexIn = null; + } + + // Have PostingsReader init itself + postingsReader.init(in); + + // Read per-field details + seekDir(in, dirOffset); + if (indexDivisor != -1) { + seekDir(indexIn, indexDirOffset); + } + + final int numFields = in.readVInt(); + if (numFields < 0) { + throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")"); + } + + for(int i=0;i<numFields;i++) { + final int field = in.readVInt(); + final long numTerms = in.readVLong(); + assert numTerms >= 0; + final int numBytes = in.readVInt(); + final BytesRef rootCode = new BytesRef(new byte[numBytes]); + in.readBytes(rootCode.bytes, 0, numBytes); + rootCode.length = numBytes; + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + assert fieldInfo != null: "field=" + field; + final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); + final long sumDocFreq = in.readVLong(); + final int docCount = in.readVInt(); + if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs + throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")"); + } + if (sumDocFreq < docCount) { // #postings must be >= #docs with field + throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + in + ")"); + } + if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings + throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")"); + } + final long indexStartFP = indexDivisor != -1 ? indexIn.readVLong() : 0; + FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn)); + if (previous != null) { + throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")"); + } + } + + success = true; + } finally { + if (!success) { + // this.close() will close in: + IOUtils.closeWhileHandlingException(this); + } + } + } + + /** Reads terms file header. */ + protected int readHeader(IndexInput input) throws IOException { + int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_CODEC_NAME, + BlockTreeTermsWriter.TERMS_VERSION_START, + BlockTreeTermsWriter.TERMS_VERSION_CURRENT); + if (version < BlockTreeTermsWriter.TERMS_VERSION_APPEND_ONLY) { + dirOffset = input.readLong(); + } + return version; + } + + /** Reads index file header. */ + protected int readIndexHeader(IndexInput input) throws IOException { + int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, + BlockTreeTermsWriter.TERMS_INDEX_VERSION_START, + BlockTreeTermsWriter.TERMS_INDEX_VERSION_CURRENT); + if (version < BlockTreeTermsWriter.TERMS_INDEX_VERSION_APPEND_ONLY) { + indexDirOffset = input.readLong(); + } + return version; + } + + /** Seek {@code input} to the directory offset. */ + protected void seekDir(IndexInput input, long dirOffset) + throws IOException { + if (version >= BlockTreeTermsWriter.TERMS_INDEX_VERSION_APPEND_ONLY) { + input.seek(input.length() - 8); + dirOffset = input.readLong(); + } + input.seek(dirOffset); + } + + // for debugging + // private static String toHex(int v) { + // return "0x" + Integer.toHexString(v); + // } + + @Override + public void close() throws IOException { + try { + IOUtils.close(indexIn, in, postingsReader); + } finally { + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fields.clear(); + } + } + + @Override + public Iterator<String> iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + // for debugging + String brToString(BytesRef b) { + if (b == null) { + return "null"; + } else { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + } + + /** + * BlockTree statistics for a single field + * returned by {@link FieldReader#computeStats()}. + */ + public static class Stats { + /** How many nodes in the index FST. */ + public long indexNodeCount; + + /** How many arcs in the index FST. */ + public long indexArcCount; + + /** Byte size of the index. */ + public long indexNumBytes; + + /** Total number of terms in the field. */ + public long totalTermCount; + + /** Total number of bytes (sum of term lengths) across all terms in the field. */ + public long totalTermBytes; + + /** The number of normal (non-floor) blocks in the terms file. */ + public int nonFloorBlockCount; + + /** The number of floor blocks (meta-blocks larger than the + * allowed {@code maxItemsPerBlock}) in the terms file. */ + public int floorBlockCount; + + /** The number of sub-blocks within the floor blocks. */ + public int floorSubBlockCount; + + /** The number of "internal" blocks (that have both + * terms and sub-blocks). */ + public int mixedBlockCount; + + /** The number of "leaf" blocks (blocks that have only + * terms). */ + public int termsOnlyBlockCount; + + /** The number of "internal" blocks that do not contain + * terms (have only sub-blocks). */ + public int subBlocksOnlyBlockCount; + + /** Total number of blocks. */ + public int totalBlockCount; + + /** Number of blocks at each prefix depth. */ + public int[] blockCountByPrefixLen = new int[10]; + private int startBlockCount; + private int endBlockCount; + + /** Total number of bytes used to store term suffixes. */ + public long totalBlockSuffixBytes; + + /** Total number of bytes used to store term stats (not + * including what the {@link PostingsBaseFormat} + * stores. */ + public long totalBlockStatsBytes; + + /** Total bytes stored by the {@link PostingsBaseFormat}, + * plus the other few vInts stored in the frame. */ + public long totalBlockOtherBytes; + + /** Segment name. */ + public final String segment; + + /** Field name. */ + public final String field; + + Stats(String segment, String field) { + this.segment = segment; + this.field = field; + } + + void startBlock(FieldReader.SegmentTermsEnum.Frame frame, boolean isFloor) { + totalBlockCount++; + if (isFloor) { + if (frame.fp == frame.fpOrig) { + floorBlockCount++; + } + floorSubBlockCount++; + } else { + nonFloorBlockCount++; + } + + if (blockCountByPrefixLen.length <= frame.prefix) { + blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1+frame.prefix); + } + blockCountByPrefixLen[frame.prefix]++; + startBlockCount++; + totalBlockSuffixBytes += frame.suffixesReader.length(); + totalBlockStatsBytes += frame.statsReader.length(); + } + + void endBlock(FieldReader.SegmentTermsEnum.Frame frame) { + final int termCount = frame.isLeafBlock ? frame.entCount : frame.state.termBlockOrd; + final int subBlockCount = frame.entCount - termCount; + totalTermCount += termCount; + if (termCount != 0 && subBlockCount != 0) { + mixedBlockCount++; + } else if (termCount != 0) { + termsOnlyBlockCount++; + } else if (subBlockCount != 0) { + subBlocksOnlyBlockCount++; + } else { + throw new IllegalStateException(); + } + endBlockCount++; + final long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.length() - frame.statsReader.length(); + assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; + totalBlockOtherBytes += otherBytes; + } + + void term(BytesRef term) { + totalTermBytes += term.length; + } + + void finish() { + assert startBlockCount == endBlockCount: "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount; + assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount: "floorSubBlockCount=" + floorSubBlockCount + " nonFloorBlockCount=" + nonFloorBlockCount + " totalBlockCount=" + totalBlockCount; + assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount: "totalBlockCount=" + totalBlockCount + " mixedBlockCount=" + mixedBlockCount + " subBlocksOnlyBlockCount=" + subBlocksOnlyBlockCount + " termsOnlyBlockCount=" + termsOnlyBlockCount; + } + + @Override + public String toString() { + final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); + PrintStream out; + try { + out = new PrintStream(bos, false, "UTF-8"); + } catch (UnsupportedEncodingException bogus) { + throw new RuntimeException(bogus); + } + + out.println(" index FST:"); + out.println(" " + indexNodeCount + " nodes"); + out.println(" " + indexArcCount + " arcs"); + out.println(" " + indexNumBytes + " bytes"); + out.println(" terms:"); + out.println(" " + totalTermCount + " terms"); + out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : "")); + out.println(" blocks:"); + out.println(" " + totalBlockCount + " blocks"); + out.println(" " + termsOnlyBlockCount + " terms-only blocks"); + out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks"); + out.println(" " + mixedBlockCount + " mixed blocks"); + out.println(" " + floorBlockCount + " floor blocks"); + out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks"); + out.println(" " + floorSubBlockCount + " floor sub-blocks"); + out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : "")); + out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : "")); + out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : "")); + if (totalBlockCount != 0) { + out.println(" by prefix length:"); + int total = 0; + for(int prefix=0;prefix<blockCountByPrefixLen.length;prefix++) { + final int blockCount = blockCountByPrefixLen[prefix]; + total += blockCount; + if (blockCount != 0) { + out.println(" " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount); + } + } + assert totalBlockCount == total; + } + + try { + return bos.toString("UTF-8"); + } catch (UnsupportedEncodingException bogus) { + throw new RuntimeException(bogus); + } + } + } + + final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton(); + final BytesRef NO_OUTPUT = fstOutputs.getNoOutput(); + + /** BlockTree's implementation of {@link Terms}. */ + public final class FieldReader extends Terms { + final long numTerms; + final FieldInfo fieldInfo; + final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final long indexStartFP; + final long rootBlockFP; + final BytesRef rootCode; + private final FST<BytesRef> index; + + //private boolean DEBUG; + + FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.indexStartFP = indexStartFP; + this.rootCode = rootCode; + // if (DEBUG) { + // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); + // } + + rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + + if (indexIn != null) { + final IndexInput clone = indexIn.clone(); + //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); + clone.seek(indexStartFP); + index = new FST<BytesRef>(clone, ByteSequenceOutputs.getSingleton()); + + /* + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(index, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + } + */ + } else { + index = null; + } + } + + /** For debugging -- used by CheckIndex too*/ + // TODO: maybe push this into Terms? + public Stats computeStats() throws IOException { + return new SegmentTermsEnum().computeBlockStats(); + } + + @Override + public Comparator<BytesRef> getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + return sumDocFreq; + } + + @Override + public int getDocCount() { + return docCount; + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new IntersectEnum(compiled, startTerm); + } + + // NOTE: cannot seek! + private final class IntersectEnum extends TermsEnum { + private final IndexInput in; + + private Frame[] stack; + + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5]; + + private final RunAutomaton runAutomaton; + private final CompiledAutomaton compiledAutomaton; + + private Frame currentFrame; + + private final BytesRef term = new BytesRef(); + + private final FST.BytesReader fstReader; + + // TODO: can we share this with the frame in STE? + private final class Frame { + final int ord; + long fp; + long fpOrig; + long fpEnd; + long lastSubFP; + + // State in automaton + int state; + + int metaDataUpto; + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] statBytes = new byte[64]; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + byte[] floorData = new byte[32]; + final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + int numFollowFloorBlocks; + int nextFloorLabel; + + Transition[] transitions; + int curTransitionMax; + int transitionIndex; + + FST.Arc<BytesRef> arc; + + final BlockTermState termState; + + // Cumulative output so far + BytesRef outputPrefix; + + private int startBytePos; + private int suffix; + + public Frame(int ord) throws IOException { + this.ord = ord; + termState = postingsReader.newTermState(); + termState.totalTermFreq = -1; + } + + void loadNextFloorBlock() throws IOException { + assert numFollowFloorBlocks > 0; + //if (DEBUG) System.out.println(" loadNextFoorBlock trans=" + transitions[transitionIndex]); + + do { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + // if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel); + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin()); + + load(null); + } + + public void setState(int state) { + this.state = state; + transitionIndex = 0; + transitions = compiledAutomaton.sortedTransitions[state]; + if (transitions.length != 0) { + curTransitionMax = transitions[0].getMax(); + } else { + curTransitionMax = -1; + } + } + + void load(BytesRef frameIndexData) throws IOException { + + // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state)); + + if (frameIndexData != null && transitions.length != 0) { + // Floor frame + if (floorData.length < frameIndexData.length) { + this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)]; + } + System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length); + floorDataReader.reset(floorData, 0, frameIndexData.length); + // Skip first long -- has redundant fp, hasTerms + // flag, isFloor flag + final long code = floorDataReader.readVLong(); + if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) { + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel); + + // If current state is accept, we must process + // first block in case it has empty suffix: + if (!runAutomaton.isAccept(state)) { + // Maybe skip floor blocks: + while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } + } + } + } + + in.seek(fp); + int code = in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + // term suffixes: + code = in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes); + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + + // stats + numBytes = in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + metaDataUpto = 0; + + termState.termBlockOrd = 0; + nextEnt = 0; + + postingsReader.readTermsBlock(in, fieldInfo, termState); + + if (!isLastInFloor) { + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = in.getFilePointer(); + } + } + + // TODO: maybe add scanToLabel; should give perf boost + + public boolean next() { + return isLeafBlock ? nextLeaf() : nextNonLeaf(); + } + + // Decodes next entry; returns true if it's a sub-block + public boolean nextLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixesReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + return false; + } + + public boolean nextNonLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixesReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + if ((code & 1) == 0) { + // A normal term + termState.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + lastSubFP = fp - suffixesReader.readVLong(); + return true; + } + } + + public int getTermBlockOrd() { + return isLeafBlock ? nextEnt : termState.termBlockOrd; + } + + public void decodeMetaData() throws IOException { + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + assert limit > 0; + + // We must set/incr state.termCount because + // postings impl can look at this + termState.termBlockOrd = metaDataUpto; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + termState.docFreq = statsReader.readVInt(); + //if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { + termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); + //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + + postingsReader.nextTerm(fieldInfo, termState); + metaDataUpto++; + termState.termBlockOrd++; + } + } + } + + private BytesRef savedStartTerm; + + // TODO: in some cases we can filter by length? eg + // regexp foo*bar must be at least length 6 bytes + public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // if (DEBUG) { + // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef)); + // } + runAutomaton = compiled.runAutomaton; + compiledAutomaton = compiled; + in = BlockTreeTermsReader.this.in.clone(); + stack = new Frame[5]; + for(int idx=0;idx<stack.length;idx++) { + stack[idx] = new Frame(idx); + } + for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { + arcs[arcIdx] = new FST.Arc<BytesRef>(); + } + + if (index == null) { + fstReader = null; + } else { + fstReader = index.getBytesReader(); + } + + // TODO: if the automaton is "smallish" we really + // should use the terms index to seek at least to + // the initial term and likely to subsequent terms + // (or, maybe just fallback to ATE for such cases). + // Else the seek cost of loading the frames will be + // too costly. + + final FST.Arc<BytesRef> arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + + // Special pushFrame since it's the first one: + final Frame f = stack[0]; + f.fp = f.fpOrig = rootBlockFP; + f.prefix = 0; + f.setState(runAutomaton.getInitialState()); + f.arc = arc; + f.outputPrefix = arc.output; + f.load(rootCode); + + // for assert: + assert setSavedStartTerm(startTerm); + + currentFrame = f; + if (startTerm != null) { + seekToStartTerm(startTerm); + } + } + + // only for assert: + private boolean setSavedStartTerm(BytesRef startTerm) { + savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm); + return true; + } + + @Override + public TermState termState() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.clone(); + } + + private Frame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) { + next[stackOrd] = new Frame(stackOrd); + } + stack = next; + } + assert stack[ord].ord == ord; + return stack[ord]; + } + + private FST.Arc<BytesRef> getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next = + new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { + next[arcOrd] = new FST.Arc<BytesRef>(); + } + arcs = next; + } + return arcs[ord]; + } + + private Frame pushFrame(int state) throws IOException { + final Frame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord); + + f.fp = f.fpOrig = currentFrame.lastSubFP; + f.prefix = currentFrame.prefix + currentFrame.suffix; + // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix); + f.setState(state); + + // Walk the arc through the index -- we only + // "bother" with this so we can get the floor data + // from the index and skip floor blocks when + // possible: + FST.Arc<BytesRef> arc = currentFrame.arc; + int idx = currentFrame.prefix; + assert currentFrame.suffix > 0; + BytesRef output = currentFrame.outputPrefix; + while (idx < f.prefix) { + final int target = term.bytes[idx] & 0xff; + // TODO: we could be more efficient for the next() + // case by using current arc as starting point, + // passed to findTargetArc + arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader); + assert arc != null; + output = fstOutputs.add(output, arc.output); + idx++; + } + + f.arc = arc; + f.outputPrefix = output; + assert arc.isFinal(); + f.load(fstOutputs.add(output, arc.nextFinalOutput)); + return f; + } + + @Override + public BytesRef term() { + return term; + } + + @Override + public int docFreq() throws IOException { + //if (DEBUG) System.out.println("BTIR.docFreq"); + currentFrame.decodeMetaData(); + //if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq); + return currentFrame.termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException { + currentFrame.decodeMetaData(); + return postingsReader.docs(fieldInfo, currentFrame.termState, skipDocs, reuse, flags); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + // Positions were not indexed: + return null; + } + + currentFrame.decodeMetaData(); + return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse, flags); + } + + private int getState() { + int state = currentFrame.state; + for(int idx=0;idx<currentFrame.suffix;idx++) { + state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff); + assert state != -1; + } + return state; + } + + // NOTE: specialized to only doing the first-time + // seek, but we could generalize it to allow + // arbitrary seekExact/Ceil. Note that this is a + // seekFloor! + private void seekToStartTerm(BytesRef target) throws IOException { + //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString()); + assert currentFrame.ord == 0; + if (term.length < target.length) { + term.bytes = ArrayUtil.grow(term.bytes, target.length); + } + FST.Arc<BytesRef> arc = arcs[0]; + assert arc == currentFrame.arc; + + for(int idx=0;idx<=target.length;idx++) { + + while (true) { + final int savePos = currentFrame.suffixesReader.getPosition(); + final int saveStartBytePos = currentFrame.startBytePos; + final int saveSuffix = currentFrame.suffix; + final long saveLastSubFP = currentFrame.lastSubFP; + final int saveTermBlockOrd = currentFrame.termState.termBlockOrd; + + final boolean isSubBlock = currentFrame.next(); + + //if (DEBUG) System.out.println(" cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff)); + term.length = currentFrame.prefix + currentFrame.suffix; + if (term.bytes.length < term.length) { + term.bytes = ArrayUtil.grow(term.bytes, term.length); + } + System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); + + if (isSubBlock && StringHelper.startsWith(target, term)) { + // Recurse + //if (DEBUG) System.out.println(" recurse!"); + currentFrame = pushFrame(getState()); + break; + } else { + final int cmp = term.compareTo(target); + if (cmp < 0) { + if (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + //if (DEBUG) System.out.println(" load floorBlock"); + currentFrame.loadNextFloorBlock(); + continue; + } else { + //if (DEBUG) System.out.println(" return term=" + brToString(term)); + return; + } + } + continue; + } else if (cmp == 0) { + //if (DEBUG) System.out.println(" return term=" + brToString(term)); + return; + } else { + // Fallback to prior entry: the semantics of + // this method is that the first call to + // next() will return the term after the + // requested term + currentFrame.nextEnt--; + currentFrame.lastSubFP = saveLastSubFP; + currentFrame.startBytePos = saveStartBytePos; + currentFrame.suffix = saveSuffix; + currentFrame.suffixesReader.setPosition(savePos); + currentFrame.termState.termBlockOrd = saveTermBlockOrd; + System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); + term.length = currentFrame.prefix + currentFrame.suffix; + // If the last entry was a block we don't + // need to bother recursing and pushing to + // the last term under it because the first + // next() will simply skip the frame anyway + return; + } + } + } + } + + assert false; + } + + @Override + public BytesRef next() throws IOException { + + // if (DEBUG) { + // System.out.println("\nintEnum.next seg=" + segment); + // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + // } + + nextTerm: + while(true) { + // Pop finished frames + while (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + //if (DEBUG) System.out.println(" next-floor-block"); + currentFrame.loadNextFloorBlock(); + //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + } else { + //if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + return null; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + assert currentFrame.lastSubFP == lastFP; + //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + } + } + + final boolean isSubBlock = currentFrame.next(); + // if (DEBUG) { + // final BytesRef suffixRef = new BytesRef(); + // suffixRef.bytes = currentFrame.suffixBytes; + // suffixRef.offset = currentFrame.startBytePos; + // suffixRef.length = currentFrame.suffix; + // System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef)); + // } + + if (currentFrame.suffix != 0) { + final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; + while (label > currentFrame.curTransitionMax) { + if (currentFrame.transitionIndex >= currentFrame.transitions.length-1) { + // Stop processing this frame -- no further + // matches are possible because we've moved + // beyond what the max transition will allow + //if (DEBUG) System.out.println(" break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex])); + + // sneaky! forces a pop above + currentFrame.isLastInFloor = true; + currentFrame.nextEnt = currentFrame.entCount; + continue nextTerm; + } + currentFrame.transitionIndex++; + currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].getMax(); + //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]); + } + } + + // First test the common suffix, if set: + if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) { + final int termLen = currentFrame.prefix + currentFrame.suffix; + if (termLen < compiledAutomaton.commonSuffixRef.length) { + // No match + // if (DEBUG) { + // System.out.println(" skip: common suffix length"); + // } + continue nextTerm; + } + + final byte[] suffixBytes = currentFrame.suffixBytes; + final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes; + + final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix; + assert compiledAutomaton.commonSuffixRef.offset == 0; + int suffixBytesPos; + int commonSuffixBytesPos = 0; + + if (lenInPrefix > 0) { + // A prefix of the common suffix overlaps with + // the suffix of the block prefix so we first + // test whether the prefix part matches: + final byte[] termBytes = term.bytes; + int termBytesPos = currentFrame.prefix - lenInPrefix; + assert termBytesPos >= 0; + final int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch (in prefix)"); + // } + continue nextTerm; + } + } + suffixBytesPos = currentFrame.startBytePos; + } else { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length; + } + + // Test overlapping suffix part: + final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length; + while (commonSuffixBytesPos < commonSuffixBytesPosEnd) { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch"); + // } + continue nextTerm; + } + } + } + + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + + // See if the term prefix matches the automaton: + int state = currentFrame.state; + for (int idx=0;idx<currentFrame.suffix;idx++) { + state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff); + if (state == -1) { + // No match + //System.out.println(" no s=" + state); + continue nextTerm; + } else { + //System.out.println(" c s=" + state); + } + } + + if (isSubBlock) { + // Match! Recurse: + //if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP); + copyTerm(); + currentFrame = pushFrame(state); + //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + } else if (runAutomaton.isAccept(state)) { + copyTerm(); + //if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + brToString(term)); + assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString(); + return term; + } else { + //System.out.println(" no s=" + state); + } + } + } + + private void copyTerm() { + //System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]); + final int len = currentFrame.prefix + currentFrame.suffix; + if (term.bytes.length < len) { + term.bytes = ArrayUtil.grow(term.bytes, len); + } + System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); + term.length = len; + } + + @Override + public Comparator<BytesRef> getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public boolean seekExact(BytesRef text, boolean useCache) { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache) { + throw new UnsupportedOperationException(); + } + } + + // Iterates through terms in this field + private final class SegmentTermsEnum extends TermsEnum { + private IndexInput in; + + private Frame[] stack; + private final Frame staticFrame; + private Frame currentFrame; + private boolean termExists; + + private int targetBeforeCurrentLength; + + private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); + + // What prefix of the current term was present in the index: + private int validIndexPrefix; + + // assert only: + private boolean eof; + + final BytesRef term = new BytesRef(); + private final FST.BytesReader fstReader; + + @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = + new FST.Arc[1]; + + public SegmentTermsEnum() throws IOException { + //if (DEBUG) System.out.println("BTTR.init seg=" + segment); + stack = new Frame[0]; + + // Used to hold seek by TermState, or cached seek + staticFrame = new Frame(-1); + + if (index == null) { + fstReader = null; + } else { + fstReader = index.getBytesReader(); + } + + // Init w/ root block; don't use index since it may + // not (and need not) have been loaded + for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { + arcs[arcIdx] = new FST.Arc<BytesRef>(); + } + + currentFrame = staticFrame; + final FST.Arc<BytesRef> arc; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = staticFrame; + //currentFrame = pushFrame(arc, rootCode, 0); + //currentFrame.loadBlock(); + validIndexPrefix = 0; + // if (DEBUG) { + // System.out.println("init frame state " + currentFrame.ord); + // printSeekState(); + // } + + //System.out.println(); + // computeBlockStats().print(System.out); + } + + // Not private to avoid synthetic access$NNN methods + void initIndexInput() { + if (this.in == null) { + this.in = BlockTreeTermsReader.this.in.clone(); + } + } + + /** Runs next() through the entire terms dict, + * computing aggregate statistics. */ + public Stats computeBlockStats() throws IOException { + + Stats stats = new Stats(segment, fieldInfo.name); + if (index != null) { + stats.indexNodeCount = index.getNodeCount(); + stats.indexArcCount = index.getArcCount(); + stats.indexNumBytes = index.sizeInBytes(); + } + + currentFrame = staticFrame; + FST.Arc<BytesRef> arc; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + + // Empty string prefix must have an output in the + // index! + currentFrame = pushFrame(arc, rootCode, 0); + currentFrame.fpOrig = currentFrame.fp; + currentFrame.loadBlock(); + validIndexPrefix = 0; + + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + + allTerms: + while (true) { + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + stats.endBlock(currentFrame); + if (!currentFrame.isLastInFloor) { + currentFrame.loadNextFloorBlock(); + stats.startBlock(currentFrame, true); + } else { + if (currentFrame.ord == 0) { + break allTerms; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + assert lastFP == currentFrame.lastSubFP; + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length); + currentFrame.fpOrig = currentFrame.fp; + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.loadBlock(); + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + } else { + stats.term(term); + break; + } + } + } + + stats.finish(); + + // Put root frame back: + currentFrame = staticFrame; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, rootCode, 0); + currentFrame.rewind(); + currentFrame.loadBlock(); + validIndexPrefix = 0; + term.length = 0; + + return stats; + } + + private Frame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) { + next[stackOrd] = new Frame(stackOrd); + } + stack = next; + } + assert stack[ord].ord == ord; + return stack[ord]; + } + + private FST.Arc<BytesRef> getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next = + new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { + next[arcOrd] = new FST.Arc<BytesRef>(); + } + arcs = next; + } + return arcs[ord]; + } + + @Override + public Comparator<BytesRef> getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + // Pushes a frame we seek'd to + Frame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException { + scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); + final long code = scratchReader.readVLong(); + final long fpSeek = code >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + final Frame f = getFrame(1+currentFrame.ord); + f.hasTerms = (code & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; + f.hasTermsOrig = f.hasTerms; + f.isFloor = (code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; + if (f.isFloor) { + f.setFloorData(scratchReader, frameData); + } + pushFrame(arc, fpSeek, length); + + return f; + } + + // Pushes next'd frame or seek'd frame; we later + // lazy-load the frame only when needed + Frame pushFrame(FST.Arc<BytesRef> arc, long fp, int length) throws IOException { + final Frame f = getFrame(1+currentFrame.ord); + f.arc = arc; + if (f.fpOrig == fp && f.nextEnt != -1) { + //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); + if (f.prefix > targetBeforeCurrentLength) { + f.rewind(); + } else { + // if (DEBUG) { + // System.out.println(" skip rewind!"); + // } + } + assert length == f.prefix; + } else { + f.nextEnt = -1; + f.prefix = length; + f.state.termBlockOrd = 0; + f.fpOrig = f.fp = fp; + f.lastSubFP = -1; + // if (DEBUG) { + // final int sav = term.length; + // term.length = length; + // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); + // term.length = sav; + // } + } + + return f; + } + + // asserts only + private boolean clearEOF() { + eof = false; + return true; + } + + // asserts only + private boolean setEOF() { + eof = true; + return true; + } + + @Override + public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException { + + if (index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (term.bytes.length <= target.length) { + term.bytes = ArrayUtil.grow(term.bytes, 1+target.length); + } + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix); + // printSeekState(); + // } + + FST.Arc<BytesRef> arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + Frame lastFrame = stack[0]; + assert validIndexPrefix <= term.length; + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: reverse vLong byte order for better FST + // prefix output sharing + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + // } + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + //if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) { + //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF)); + //} + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + + // Second compare the rest of the term, but + // don't save arc/output/frame; we only do this + // to find out if the target term is before, + // equal or after the current term + final int targetLimit2 = Math.min(target.length, term.length); + while (targetUpto < targetLimit2) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + // } + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return true"); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + //validIndexPrefix = currentFrame.depth; + //term.length = target.length; + //return termExists; + } + + } else { + + targetBeforeCurrentLength = -1; + arc = index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + // } + + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + if (!currentFrame.hasTerms) { + termExists = false; + term.bytes[targetUpto] = (byte) targetLabel; + term.length = 1+targetUpto; + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term)); + // } + return false; + } + } else { + // Follow this arc + arc = nextArc; + term.bytes[targetUpto] = (byte) targetLabel; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + + // if (DEBUG) { + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + // } + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + // Target term is entirely contained in the index: + if (!currentFrame.hasTerms) { + termExists = false; + term.length = targetUpto; + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString()); + // } + + return false; + } + } + + @Override + public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException { + if (index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (term.bytes.length <= target.length) { + term.bytes = ArrayUtil.grow(term.bytes, 1+target.length); + } + + assert clearEOF(); + + //if (DEBUG) { + //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix); + //printSeekState(); + //} + + FST.Arc<BytesRef> arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + //if (DEBUG) { + //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + //} + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + Frame lastFrame = stack[0]; + assert validIndexPrefix <= term.length; + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TOOD: we should write our vLong backwards (MSB + // first) to get better sharing from the FST + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + //} + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + // TOOD: we could save the outputs in local + // byte[][] instead of making new objs ever + // seek; but, often the FST doesn't have any + // shared bytes (but this could change if we + // reverse vLong byte order) + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + // Second compare the rest of the term, but + // don't save arc/output/frame: + final int targetLimit2 = Math.min(target.length, term.length); + while (targetUpto < targetLimit2) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + //} + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + //if (DEBUG) { + //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + //if (DEBUG) { + //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length == target.length; + if (termExists) { + //if (DEBUG) { + //System.out.println(" target is same as current; return FOUND"); + //} + return SeekStatus.FOUND; + } else { + //if (DEBUG) { + //System.out.println(" target is same as current but term doesn't exist"); + //} + } + } + + } else { + + targetBeforeCurrentLength = -1; + arc = index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + //if (DEBUG) { + //System.out.println(" no seek state; push root frame"); + //} + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0); + } + + //if (DEBUG) { + //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + //} + + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + //if (DEBUG) { + //System.out.println(" return " + result + " term=" + brToString(term) + " " + term); + //} + return result; + } + } else { + // Follow this arc + term.bytes[targetUpto] = (byte) targetLabel; + arc = nextArc; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + + //if (DEBUG) { + //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + //} + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + return result; + } + } + + @SuppressWarnings("unused") + private void printSeekState(PrintStream out) throws IOException { + if (currentFrame == staticFrame) { + out.println(" no prior seek"); + } else { + out.println(" prior seek state:"); + int ord = 0; + boolean isSeekFrame = true; + while(true) { + Frame f = getFrame(ord); + assert f != null; + final BytesRef prefix = new BytesRef(term.bytes, 0, f.prefix); + if (f.nextEnt == -1) { + out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); + } else { + out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); + } + if (index != null) { + assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; + if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.bytes[f.prefix-1]&0xFF)) { + out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.bytes[f.prefix-1]&0xFF)); + throw new RuntimeException("seek state is broken"); + } + BytesRef output = Util.get(index, prefix); + if (output == null) { + out.println(" broken seek state: prefix is not final in index"); + throw new RuntimeException("seek state is broken"); + } else if (isSeekFrame && !f.isFloor) { + final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); + final long codeOrig = reader.readVLong(); + final long code = (f.fp << BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0); + if (codeOrig != code) { + out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); + throw new RuntimeException("seek state is broken"); + } + } + } + if (f == currentFrame) { + break; + } + if (f.prefix == validIndexPrefix) { + isSeekFrame = false; + } + ord++; + } + } + } + + /* Decodes only the term bytes of the next term. If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. */ + @Override + public BytesRef next() throws IOException { + + if (in == null) { + // Fresh TermsEnum; seek to first term: + final FST.Arc<BytesRef> arc; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, rootCode, 0); + currentFrame.loadBlock(); + } + + targetBeforeCurrentLength = currentFrame.ord; + + assert !eof; + //if (DEBUG) { + //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix); + //printSeekState(); + //} + + if (currentFrame == staticFrame) { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term); + final boolean result = seekExact(term, false); + assert result; + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + currentFrame.loadNextFloorBlock(); + } else { + //if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + //if (DEBUG) System.out.println(" return null"); + assert setEOF(); + term.length = 0; + validIndexPrefix = 0; + currentFrame.rewind(); + termExists = false; + return null; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + + if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) { + // We popped into a frame that's not loaded + // yet or not scan'd to the right entry + currentFrame.scanToFloorFrame(term); + currentFrame.loadBlock(); + currentFrame.scanToSubBlock(lastFP); + } + + // Note that the seek state (last seek) has been + // invalidated beyond this depth + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); + //if (DEBUG) { + //System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + //} + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + //if (DEBUG) System.out.println(" push frame"); + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length); + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.loadBlock(); + } else { + //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord); + return term; + } + } + } + + @Override + public BytesRef term() { + assert !eof; + return term; + } + + @Override + public int docFreq() throws IOException { + assert !eof; + //if (DEBUG) System.out.println("BTR.docFreq"); + currentFrame.decodeMetaData(); + //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); + return currentFrame.state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + return currentFrame.state.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException { + assert !eof; + //if (DEBUG) { + //System.out.println("BTTR.docs seg=" + segment); +
<TRUNCATED>