This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 41173647255b100100ba13e182a1dac9e05c76f0 Author: Shad Storhaug <[email protected]> AuthorDate: Thu Aug 27 12:11:05 2020 +0700 Codecs: Added MoveNext() implementations to each TermsEnum subclass (see #279, #212) --- .../BlockTerms/BlockTermsReader.cs | 58 ++++ .../Bloom/BloomFilteringPostingsFormat.cs | 5 + .../Memory/DirectPostingsFormat.cs | 310 +++++++++++++++++++++ src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs | 75 ++++- src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs | 73 +++++ .../Memory/MemoryDocValuesProducer.cs | 5 + .../Memory/MemoryPostingsFormat.cs | 14 + .../SimpleText/SimpleTextFieldsReader.cs | 16 ++ .../SimpleText/SimpleTextTermVectorsReader.cs | 23 +- .../MemoryIndex.MemoryIndexReader.cs | 15 + .../Index/Sorter/SortingAtomicReader.cs | 2 +- .../Codecs/RAMOnly/RAMOnlyPostingsFormat.cs | 42 ++- .../Index/AssertingAtomicReader.cs | 17 ++ .../Index/TestFilterAtomicReader.cs | 13 + src/Lucene.Net.Tests/Search/TestRegexpRandom2.cs | 2 +- src/Lucene.Net/Codecs/BlockTreeTermsReader.cs | 283 +++++++++++++++++++ .../Compressing/CompressingTermVectorsReader.cs | 25 ++ src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs | 82 ++++++ .../Codecs/Lucene3x/Lucene3xTermVectorsReader.cs | 13 +- .../Codecs/Lucene40/Lucene40TermVectorsReader.cs | 72 ++++- .../Codecs/Lucene42/Lucene42DocValuesProducer.cs | 3 + .../Codecs/Lucene45/Lucene45DocValuesProducer.cs | 20 +- src/Lucene.Net/Index/DocTermOrds.cs | 16 ++ src/Lucene.Net/Index/FilterAtomicReader.cs | 6 + src/Lucene.Net/Index/FilteredTermsEnum.cs | 59 +++- src/Lucene.Net/Index/MultiTermsEnum.cs | 33 +++ src/Lucene.Net/Index/SortedDocValuesTermsEnum.cs | 12 + .../Index/SortedSetDocValuesTermsEnum.cs | 12 + src/Lucene.Net/Index/Terms.cs | 2 +- src/Lucene.Net/Index/TermsEnum.cs | 22 +- src/Lucene.Net/Search/FuzzyTermsEnum.cs | 27 ++ src/Lucene.Net/Util/Fst/BytesRefFSTEnum.cs | 18 ++ 32 files changed, 1333 insertions(+), 42 deletions(-) diff --git a/src/Lucene.Net.Codecs/BlockTerms/BlockTermsReader.cs 
b/src/Lucene.Net.Codecs/BlockTerms/BlockTermsReader.cs index f5e1f6c..269b0e3 100644 --- a/src/Lucene.Net.Codecs/BlockTerms/BlockTermsReader.cs +++ b/src/Lucene.Net.Codecs/BlockTerms/BlockTermsReader.cs @@ -654,6 +654,64 @@ namespace Lucene.Net.Codecs.BlockTerms } } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + //System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termBlockOrd); + + // If seek was previously called and the term was cached, + // usually caller is just going to pull a D/&PEnum or get + // docFreq, etc. But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + if (seekPending) + { + if (Debugging.AssertsEnabled) Debugging.Assert(!indexIsCurrent); + input.Seek(state.BlockFilePointer); + int pendingSeekCount = state.TermBlockOrd; + bool result = NextBlock(); + + long savOrd = state.Ord; + + // Block must exist since seek(TermState) was called w/ a + // TermState previously returned by this enum when positioned + // on a real term: + if (Debugging.AssertsEnabled) Debugging.Assert(result); + + while (state.TermBlockOrd < pendingSeekCount) + { + BytesRef nextResult = _next(); + if (Debugging.AssertsEnabled) Debugging.Assert(nextResult != null); + } + seekPending = false; + state.Ord = savOrd; + } + //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termBlockOrd + " (vs " + blockTermCount + ")"); + if (state.TermBlockOrd == blockTermCount && !NextBlock()) + { + //System.out.println(" eof"); + indexIsCurrent = false; + return false; + } + + // TODO: cutover to something better for these ints! simple64? 
+ int suffix = termSuffixesReader.ReadVInt32(); + //System.out.println(" suffix=" + suffix); + + term.Length = termBlockPrefix + suffix; + if (term.Bytes.Length < term.Length) + { + term.Grow(term.Length); + } + termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix); + state.TermBlockOrd++; + + // NOTE: meaningless in the non-ord case + state.Ord++; + + return true; + } + public override BytesRef Next() { //System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termBlockOrd); diff --git a/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs b/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs index 9f4cc05..2124ce8 100644 --- a/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs +++ b/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs @@ -291,6 +291,11 @@ namespace Lucene.Net.Codecs.Bloom // (clone IndexInput) delegateTermsEnum ?? (delegateTermsEnum = _delegateTerms.GetIterator(_reuseDelegate)); + public override bool MoveNext() // LUCENENET specific - shortcut for better enumerator optimization + { + return Delegate.MoveNext(); + } + public override sealed BytesRef Next() { return Delegate.Next(); diff --git a/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs index be7e794..114d77d 100644 --- a/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs +++ b/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs @@ -864,6 +864,21 @@ namespace Lucene.Net.Codecs.Memory public override IComparer<BytesRef> Comparer => BytesRef.UTF8SortedAsUnicodeComparer; + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + termOrd++; + if (termOrd < outerInstance.terms.Length) + { + SetTerm(); + return true; + } + else + { + return false; + } + } + public override BytesRef Next() { termOrd++; @@ -1377,6 +1392,301 @@ namespace Lucene.Net.Codecs.Memory } } + // LUCENENET specific - duplicate 
logic for better enumerator optimization + public override bool MoveNext() + { + // if (DEBUG) { + // System.out.println("\nIE.next"); + // } + + termOrd++; + int skipUpto = 0; + + if (termOrd == 0 && outerInstance.termOffsets[1] == 0) + { + // Special-case empty string: + if (Debugging.AssertsEnabled) Debugging.Assert(stateUpto == 0); + // if (DEBUG) { + // System.out.println(" visit empty string"); + // } + if (runAutomaton.IsAccept(states[0].state)) + { + scratch.Bytes = outerInstance.termBytes; + scratch.Offset = 0; + scratch.Length = 0; + return true; + } + termOrd++; + } + + + while (true) + { + // if (DEBUG) { + // System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + " skipUpto=" + skipUpto); + // } + if (termOrd == outerInstance.terms.Length) + { + // if (DEBUG) { + // System.out.println(" return END"); + // } + return false; + } + + State state = states[stateUpto]; + if (termOrd == state.changeOrd) + { + // Pop: + // if (DEBUG) { + // System.out.println(" pop stateUpto=" + stateUpto); + // } + stateUpto--; + + continue; + } + + int termOffset = outerInstance.termOffsets[termOrd]; + int termLength = outerInstance.termOffsets[termOrd + 1] - termOffset; + int skipOffset = outerInstance.skipOffsets[termOrd]; + int numSkips = outerInstance.skipOffsets[termOrd + 1] - skipOffset; + + // if (DEBUG) { + // System.out.println(" term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips)); + // } + + if (Debugging.AssertsEnabled) Debugging.Assert(termOrd < state.changeOrd); + + if (Debugging.AssertsEnabled) Debugging.Assert(stateUpto <= termLength, () => "term.length=" + termLength + "; stateUpto=" + stateUpto); + int label = outerInstance.termBytes[termOffset + stateUpto] & 0xFF; + + while (label > state.transitionMax) + { + //System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length); + 
state.transitionUpto++; + if (state.transitionUpto == state.transitions.Length) + { + // We've exhausted transitions leaving this + // state; force pop+next/skip now: + //System.out.println("forcepop: stateUpto=" + stateUpto); + if (stateUpto == 0) + { + termOrd = outerInstance.terms.Length; + return false; + } + else + { + if (Debugging.AssertsEnabled) Debugging.Assert(state.changeOrd > termOrd); + // if (DEBUG) { + // System.out.println(" jumpend " + (state.changeOrd - termOrd)); + // } + //System.out.println(" jump to termOrd=" + states[stateUpto].changeOrd + " vs " + termOrd); + termOrd = states[stateUpto].changeOrd; + skipUpto = 0; + stateUpto--; + } + goto nextTermContinue; + } + if (Debugging.AssertsEnabled) Debugging.Assert(state.transitionUpto < state.transitions.Length, + () => " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.Length); + state.transitionMin = state.transitions[state.transitionUpto].Min; + state.transitionMax = state.transitions[state.transitionUpto].Max; + if (Debugging.AssertsEnabled) + { + Debugging.Assert(state.transitionMin >= 0); + Debugging.Assert(state.transitionMin <= 255); + Debugging.Assert(state.transitionMax >= 0); + Debugging.Assert(state.transitionMax <= 255); + } + } + + int targetLabel = state.transitionMin; + + if ((outerInstance.termBytes[termOffset + stateUpto] & 0xFF) < targetLabel) + { + // if (DEBUG) { + // System.out.println(" do bin search"); + // } + //int startTermOrd = termOrd; + int low = termOrd + 1; + int high = state.changeOrd - 1; + while (true) + { + if (low > high) + { + // Label not found + termOrd = low; + // if (DEBUG) { + // System.out.println(" advanced by " + (termOrd - startTermOrd)); + // } + //System.out.println(" jump " + (termOrd - startTermOrd)); + skipUpto = 0; + goto nextTermContinue; + } + int mid = (int)((uint)(low + high) >> 1); + int cmp = (outerInstance.termBytes[outerInstance.termOffsets[mid] + stateUpto] & 0xFF) - + targetLabel; + // if (DEBUG) { + // 
System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low] + stateUpto] & 0xFF) + " ord=" + mid); + // } + if (cmp < 0) + { + low = mid + 1; + } + else if (cmp > 0) + { + high = mid - 1; + } + else + { + // Label found; walk backwards to first + // occurrence: + while (mid > termOrd && + (outerInstance.termBytes[outerInstance.termOffsets[mid - 1] + stateUpto] & + 0xFF) == targetLabel) + { + mid--; + } + termOrd = mid; + // if (DEBUG) { + // System.out.println(" advanced by " + (termOrd - startTermOrd)); + // } + //System.out.println(" jump " + (termOrd - startTermOrd)); + skipUpto = 0; + goto nextTermContinue; + } + } + } + + int nextState = runAutomaton.Step(states[stateUpto].state, label); + + if (nextState == -1) + { + // Skip + // if (DEBUG) { + // System.out.println(" automaton doesn't accept; skip"); + // } + if (skipUpto < numSkips) + { + // if (DEBUG) { + // System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd)); + // } + termOrd = outerInstance.skips[skipOffset + skipUpto]; + } + else + { + termOrd++; + } + skipUpto = 0; + } + else if (skipUpto < numSkips) + { + Grow(); + stateUpto++; + states[stateUpto].state = nextState; + states[stateUpto].changeOrd = outerInstance.skips[skipOffset + skipUpto++]; + states[stateUpto].transitions = compiledAutomaton.SortedTransitions[nextState]; + states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionMax = -1; + + if (stateUpto == termLength) + { + // if (DEBUG) { + // System.out.println(" term ends after push"); + // } + if (runAutomaton.IsAccept(nextState)) + { + // if (DEBUG) { + // System.out.println(" automaton accepts: return"); + // } + scratch.Bytes = outerInstance.termBytes; + scratch.Offset = outerInstance.termOffsets[termOrd]; + scratch.Length = outerInstance.termOffsets[1 + termOrd] - scratch.Offset; + // if (DEBUG) { + // System.out.println(" ret " + scratch.utf8ToString()); + // } + return true; + } + else + { + // if (DEBUG) { + // System.out.println(" 
automaton rejects: nextTerm"); + // } + termOrd++; + skipUpto = 0; + } + } + } + else + { + // Run the non-indexed tail of this term: + + // TODO: add assert that we don't inc too many times + + if (compiledAutomaton.CommonSuffixRef != null) + { + //System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString()); + if (Debugging.AssertsEnabled) Debugging.Assert(compiledAutomaton.CommonSuffixRef.Offset == 0); + if (termLength < compiledAutomaton.CommonSuffixRef.Length) + { + termOrd++; + skipUpto = 0; + goto nextTermContinue; + } + int offset = termOffset + termLength - compiledAutomaton.CommonSuffixRef.Length; + for (int suffix = 0; suffix < compiledAutomaton.CommonSuffixRef.Length; suffix++) + { + if (outerInstance.termBytes[offset + suffix] != + compiledAutomaton.CommonSuffixRef.Bytes[suffix]) + { + termOrd++; + skipUpto = 0; + goto nextTermContinue; + } + } + } + + int upto = stateUpto + 1; + while (upto < termLength) + { + nextState = runAutomaton.Step(nextState, outerInstance.termBytes[termOffset + upto] & 0xFF); + if (nextState == -1) + { + termOrd++; + skipUpto = 0; + // if (DEBUG) { + // System.out.println(" nomatch tail; next term"); + // } + goto nextTermContinue; + } + upto++; + } + + if (runAutomaton.IsAccept(nextState)) + { + scratch.Bytes = outerInstance.termBytes; + scratch.Offset = outerInstance.termOffsets[termOrd]; + scratch.Length = outerInstance.termOffsets[1 + termOrd] - scratch.Offset; + // if (DEBUG) { + // System.out.println(" match tail; return " + scratch.utf8ToString()); + // System.out.println(" ret2 " + scratch.utf8ToString()); + // } + return true; + } + else + { + termOrd++; + skipUpto = 0; + // if (DEBUG) { + // System.out.println(" nomatch tail; next term"); + // } + } + } + nextTermContinue:; + } + + //nextTermBreak: ; // LUCENENET NOTE: Not used + } + public override BytesRef Next() { // if (DEBUG) { diff --git a/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs 
index c34bd4c..56ed2fc 100644 --- a/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs +++ b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs @@ -450,8 +450,6 @@ namespace Lucene.Net.Codecs.Memory // Iterates through all terms in this field private sealed class SegmentTermsEnum : BaseTermsEnum { - private readonly FSTOrdTermsReader.TermsReader outerInstance; - private readonly BytesRefFSTEnum<long?> fstEnum; /* True when current term's metadata is decoded */ @@ -462,7 +460,6 @@ namespace Lucene.Net.Codecs.Memory internal SegmentTermsEnum(FSTOrdTermsReader.TermsReader outerInstance) : base(outerInstance) { - this.outerInstance = outerInstance; this.fstEnum = new BytesRefFSTEnum<long?>(outerInstance.index); this.decoded = false; this.seekPending = false; @@ -494,6 +491,33 @@ namespace Lucene.Net.Codecs.Memory seekPending = false; } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (seekPending) // previously positioned, but termOutputs not fetched + { + seekPending = false; + var status = SeekCeil(term); + if (Debugging.AssertsEnabled) Debugging.Assert(status == SeekStatus.FOUND); // must positioned on valid term + } + // LUCENENET specific - extracted logic of UpdateEnum() so we can eliminate the null check + var moved = fstEnum.MoveNext(); + if (moved) + { + var pair = fstEnum.Current; + term = pair.Input; + ord = pair.Output.Value; + DecodeStats(); + } + else + { + term = null; + } + decoded = false; + seekPending = false; + return moved; + } + public override BytesRef Next() { if (seekPending) // previously positioned, but termOutputs not fetched @@ -641,6 +665,51 @@ namespace Lucene.Net.Codecs.Memory throw new NotSupportedException(); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + //if (TEST) System.out.println("Enum next()"); + if (pending) + { + pending = false; + DecodeStats(); + return true; + } + decoded = false; + 
while (level > 0) + { + Frame frame = NewFrame(); + if (LoadExpandFrame(TopFrame(), frame) != null) // has valid target + { + PushFrame(frame); + if (IsAccept(frame)) // gotcha + { + break; + } + continue; // check next target + } + frame = PopFrame(); + while (level > 0) + { + if (LoadNextFrame(TopFrame(), frame) != null) // has valid sibling + { + PushFrame(frame); + if (IsAccept(frame)) // gotcha + { + goto DFSBreak; + } + goto DFSContinue; // check next target + } + frame = PopFrame(); + } + return false; + DFSContinue:; + } + DFSBreak: + DecodeStats(); + return true; + } + public override BytesRef Next() { //if (TEST) System.out.println("Enum next()"); diff --git a/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs index 38d6da7..cb00f18 100644 --- a/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs +++ b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs @@ -363,6 +363,34 @@ namespace Lucene.Net.Codecs.Memory seekPending = false; } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (seekPending) // previously positioned, but termOutputs not fetched + { + seekPending = false; + SeekStatus status = SeekCeil(term); + if (Debugging.AssertsEnabled) Debugging.Assert(status == SeekStatus.FOUND); // must positioned on valid term + } + // LUCENENET specific - extracted logic of UpdateEnum() so we can eliminate the null check + var moved = fstEnum.MoveNext(); + if (moved) + { + var pair = fstEnum.Current; + term = pair.Input; + meta = pair.Output; + state.DocFreq = meta.docFreq; + state.TotalTermFreq = meta.totalTermFreq; + } + else + { + term = null; + } + decoded = false; + seekPending = false; + return moved; + } + public override BytesRef Next() { if (seekPending) // previously positioned, but termOutputs not fetched @@ -553,6 +581,51 @@ namespace Lucene.Net.Codecs.Memory } } + // LUCENENET specific - duplicate logic for better enumerator optimization + public 
override bool MoveNext() + { + //if (TEST) System.out.println("Enum next()"); + if (pending) + { + pending = false; + LoadMetaData(); + return true; + } + decoded = false; + while (level > 0) + { + Frame frame = NewFrame(); + if (LoadExpandFrame(TopFrame(), frame) != null) // has valid target + { + PushFrame(frame); + if (IsAccept(frame)) // gotcha + { + break; + } + continue; // check next target + } + frame = PopFrame(); + while (level > 0) + { + if (LoadNextFrame(TopFrame(), frame) != null) // has valid sibling + { + PushFrame(frame); + if (IsAccept(frame)) // gotcha + { + goto DFSBreak; + } + goto DFSContinue; // check next target + } + frame = PopFrame(); + } + return false; + DFSContinue:; + } + DFSBreak: + LoadMetaData(); + return true; + } + public override BytesRef Next() { //if (TEST) System.out.println("Enum next()"); diff --git a/src/Lucene.Net.Codecs/Memory/MemoryDocValuesProducer.cs b/src/Lucene.Net.Codecs/Memory/MemoryDocValuesProducer.cs index 535844f..d7cf759 100644 --- a/src/Lucene.Net.Codecs/Memory/MemoryDocValuesProducer.cs +++ b/src/Lucene.Net.Codecs/Memory/MemoryDocValuesProducer.cs @@ -764,6 +764,11 @@ namespace Lucene.Net.Codecs.Memory bytesReader = fst.GetBytesReader(); } + public override bool MoveNext() + { + return input.MoveNext(); + } + public override BytesRef Next() { var io = input.Next(); diff --git a/src/Lucene.Net.Codecs/Memory/MemoryPostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/MemoryPostingsFormat.cs index 1bf2067..ba87d4e 100644 --- a/src/Lucene.Net.Codecs/Memory/MemoryPostingsFormat.cs +++ b/src/Lucene.Net.Codecs/Memory/MemoryPostingsFormat.cs @@ -850,6 +850,20 @@ namespace Lucene.Net.Codecs.Memory public override BytesRef Term => current.Input; + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + //System.out.println("te.next"); + if (fstEnum.MoveNext()) + { + didDecode = false; + //System.out.println(" term=" + field.name + ":" + 
current.input.utf8ToString()); + return true; + } + //System.out.println(" END"); + return false; + } + public override BytesRef Next() { //System.out.println("te.next"); diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs index 9a2a97b..8550bf0 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs @@ -157,6 +157,22 @@ namespace Lucene.Net.Codecs.SimpleText } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + //if (Debugging.AssertsEnabled) Debugging.Assert(!ended); // LUCENENET: Ended field is never set, so this can never fail + var result = _fstEnum.MoveNext(); + + if (!result) return false; + + var pair1 = _fstEnum.Current.Output; + var pair2 = pair1.Output2; + _docsStart = pair1.Output1.Value; + _docFreq = (int)pair2.Output1; + _totalTermFreq = pair2.Output2.Value; + return true; + } + public override BytesRef Next() { //if (Debugging.AssertsEnabled) Debugging.Assert(!ended); // LUCENENET: Ended field is never set, so this can never fail diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextTermVectorsReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextTermVectorsReader.cs index edbeccb..0b23bd6 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextTermVectorsReader.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextTermVectorsReader.cs @@ -370,16 +370,15 @@ namespace Lucene.Net.Codecs.SimpleText _iterator = newTerms.GetEnumerator(); // LUCENENET specific: Since in .NET we don't have a HasNext() method, we need - // to call Next() and check the result if it is null instead. Since we need - // to check the result of Next() anyway for the Equals() comparison, this makes sense here. - var next = Next(); - if (next == null) + // to call MoveNext(). 
Since we need + // to check the result anyway for the Equals() comparison, this makes sense here. + if (!MoveNext()) + { + return SeekStatus.END; + } + else + { + return _current.Key.Equals(text) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } } @@ -388,6 +387,20 @@ namespace Lucene.Net.Codecs.SimpleText throw new NotSupportedException(); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (_iterator.MoveNext()) + { + _current = _iterator.Current; + return true; + } + else + { + return false; + } + } + public override BytesRef Next() { if (_iterator.MoveNext()) diff --git a/src/Lucene.Net.Memory/MemoryIndex.MemoryIndexReader.cs b/src/Lucene.Net.Memory/MemoryIndex.MemoryIndexReader.cs index 6f6d3fc..df0161f 100644 --- a/src/Lucene.Net.Memory/MemoryIndex.MemoryIndexReader.cs +++ b/src/Lucene.Net.Memory/MemoryIndex.MemoryIndexReader.cs @@ -289,6 +289,21 @@ namespace Lucene.Net.Index.Memory termUpto = (int)ord; } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + termUpto++; + if (termUpto >= info.terms.Count) + { + return false; + } + else + { + info.terms.Get(info.sortedTerms[termUpto], br); + return true; + } + } + public override BytesRef Next() { termUpto++; diff --git a/src/Lucene.Net.Misc/Index/Sorter/SortingAtomicReader.cs b/src/Lucene.Net.Misc/Index/Sorter/SortingAtomicReader.cs index bb6eace..266f233 100644 --- a/src/Lucene.Net.Misc/Index/Sorter/SortingAtomicReader.cs +++ b/src/Lucene.Net.Misc/Index/Sorter/SortingAtomicReader.cs @@ -118,7 +118,7 @@ namespace Lucene.Net.Index.Sorter { private readonly SortingTermsEnum outerInstance; - private IBits liveDocs; + private readonly IBits liveDocs; public BitsAnonymousInnerClassHelper(SortingTermsEnum outerInstance, IBits liveDocs) { diff --git a/src/Lucene.Net.TestFramework/Codecs/RAMOnly/RAMOnlyPostingsFormat.cs 
b/src/Lucene.Net.TestFramework/Codecs/RAMOnly/RAMOnlyPostingsFormat.cs index f39a807..f66e9a3 100644 --- a/src/Lucene.Net.TestFramework/Codecs/RAMOnly/RAMOnlyPostingsFormat.cs +++ b/src/Lucene.Net.TestFramework/Codecs/RAMOnly/RAMOnlyPostingsFormat.cs @@ -359,20 +359,24 @@ namespace Lucene.Net.Codecs.RAMOnly public override IComparer<BytesRef> Comparer => BytesRef.UTF8SortedAsUnicodeComparer; - public override BytesRef Next() + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() { - if (it == null) + EnsureEnumeratorInitialized(); + if (it.MoveNext()) { - if (current == null) - { - it = ramField.termToDocs.Keys.GetEnumerator(); - } - else - { - //It = RamField.TermToDocs.tailMap(Current).Keys.GetEnumerator(); - it = ramField.termToDocs.Where(kvpair => string.CompareOrdinal(kvpair.Key, current) >= 0).ToDictionary(kvpair => kvpair.Key, kvpair => kvpair.Value).Keys.GetEnumerator(); - } + current = it.Current; + return true; } + else + { + return false; + } + } + + public override BytesRef Next() + { + EnsureEnumeratorInitialized(); if (it.MoveNext()) { current = it.Current; @@ -384,6 +388,22 @@ namespace Lucene.Net.Codecs.RAMOnly } } + private void EnsureEnumeratorInitialized() // LUCENENET specific - factored out initialization step + { + if (it is null) + { + if (current is null) + { + it = ramField.termToDocs.Keys.GetEnumerator(); + } + else + { + //It = RamField.TermToDocs.tailMap(Current).Keys.GetEnumerator(); + it = ramField.termToDocs.Where(kvpair => string.CompareOrdinal(kvpair.Key, current) >= 0).Select(pair => pair.Key).GetEnumerator(); + } + } + } + public override SeekStatus SeekCeil(BytesRef term) { current = term.Utf8ToString(); diff --git a/src/Lucene.Net.TestFramework/Index/AssertingAtomicReader.cs b/src/Lucene.Net.TestFramework/Index/AssertingAtomicReader.cs index 2d67f05..0219f86 100644 --- a/src/Lucene.Net.TestFramework/Index/AssertingAtomicReader.cs +++ 
b/src/Lucene.Net.TestFramework/Index/AssertingAtomicReader.cs @@ -435,6 +435,23 @@ namespace Lucene.Net.Index return docs == null ? null : new AssertingDocsAndPositionsEnum(docs); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (Debugging.AssertsEnabled) Debugging.Assert(state == State.INITIAL || state == State.POSITIONED, "MoveNext() called on unpositioned TermsEnum"); + if (!base.MoveNext()) + { + state = State.UNPOSITIONED; + return false; + } + else + { + if (Debugging.AssertsEnabled) Debugging.Assert(base.Term.IsValid()); + state = State.POSITIONED; + return true; + } + } + // TODO: we should separately track if we are 'at the end' ? // someone should not call next() after it returns null!!!! public override BytesRef Next() diff --git a/src/Lucene.Net.Tests/Index/TestFilterAtomicReader.cs b/src/Lucene.Net.Tests/Index/TestFilterAtomicReader.cs index c1b59db..a5d50d7 100644 --- a/src/Lucene.Net.Tests/Index/TestFilterAtomicReader.cs +++ b/src/Lucene.Net.Tests/Index/TestFilterAtomicReader.cs @@ -76,6 +76,19 @@ namespace Lucene.Net.Index /// <summary> /// Scan for terms containing the letter 'e'. </summary> + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + while (m_input.MoveNext()) + { + if (m_input.Term.Utf8ToString().IndexOf('e') != -1) + return true; + } + return false; + } + + /// <summary> + /// Scan for terms containing the letter 'e'. 
</summary> public override BytesRef Next() { BytesRef text; diff --git a/src/Lucene.Net.Tests/Search/TestRegexpRandom2.cs b/src/Lucene.Net.Tests/Search/TestRegexpRandom2.cs index 6150bd0..2a76885 100644 --- a/src/Lucene.Net.Tests/Search/TestRegexpRandom2.cs +++ b/src/Lucene.Net.Tests/Search/TestRegexpRandom2.cs @@ -134,7 +134,7 @@ namespace Lucene.Net.Search private readonly TestRegexpRandom2.DumbRegexpQuery outerInstance; private CharacterRunAutomaton runAutomaton; - private CharsRef utf16 = new CharsRef(10); + private readonly CharsRef utf16 = new CharsRef(10); internal SimpleAutomatonTermsEnum(TestRegexpRandom2.DumbRegexpQuery outerInstance, TermsEnum tenum) : base(tenum) diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsReader.cs b/src/Lucene.Net/Codecs/BlockTreeTermsReader.cs index 5d1b55b..b9c94b1 100644 --- a/src/Lucene.Net/Codecs/BlockTreeTermsReader.cs +++ b/src/Lucene.Net/Codecs/BlockTreeTermsReader.cs @@ -1270,6 +1270,179 @@ namespace Lucene.Net.Codecs if (Debugging.AssertsEnabled) Debugging.Assert(false); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + // if (DEBUG) { + // System.out.println("\nintEnum.next seg=" + segment); + // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? 
"n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + // } + + while (true) + { + // Pop finished frames + while (currentFrame.nextEnt == currentFrame.entCount) + { + if (!currentFrame.isLastInFloor) + { + //if (DEBUG) System.out.println(" next-floor-block"); + currentFrame.LoadNextFloorBlock(); + //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + } + else + { + //if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) + { + return false; + } + long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + if (Debugging.AssertsEnabled) Debugging.Assert(currentFrame.lastSubFP == lastFP); + //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + } + } + + bool isSubBlock = currentFrame.Next(); + // if (DEBUG) { + // final BytesRef suffixRef = new BytesRef(); + // suffixRef.bytes = currentFrame.suffixBytes; + // suffixRef.offset = currentFrame.startBytePos; + // suffixRef.length = currentFrame.suffix; + // System.out.println(" " + (isSubBlock ? 
"sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef)); + // } + + if (currentFrame.suffix != 0) + { + int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; + while (label > currentFrame.curTransitionMax) + { + if (currentFrame.transitionIndex >= currentFrame.transitions.Length - 1) + { + // Stop processing this frame -- no further + // matches are possible because we've moved + // beyond what the max transition will allow + //if (DEBUG) System.out.println(" break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex])); + + // sneaky! forces a pop above + currentFrame.isLastInFloor = true; + currentFrame.nextEnt = currentFrame.entCount; + goto nextTermContinue; + } + currentFrame.transitionIndex++; + currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].Max; + //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]); + } + } + + // First test the common suffix, if set: + if (compiledAutomaton.CommonSuffixRef != null && !isSubBlock) + { + int termLen = currentFrame.prefix + currentFrame.suffix; + if (termLen < compiledAutomaton.CommonSuffixRef.Length) + { + // No match + // if (DEBUG) { + // System.out.println(" skip: common suffix length"); + // } + goto nextTermContinue; + } + + byte[] suffixBytes = currentFrame.suffixBytes; + byte[] commonSuffixBytes = compiledAutomaton.CommonSuffixRef.Bytes; + + int lenInPrefix = compiledAutomaton.CommonSuffixRef.Length - currentFrame.suffix; + if (Debugging.AssertsEnabled) Debugging.Assert(compiledAutomaton.CommonSuffixRef.Offset == 0); + int suffixBytesPos; + int commonSuffixBytesPos = 0; + + if (lenInPrefix > 0) + { + // A prefix of the common suffix overlaps with + // the suffix of the block prefix so we first + // test whether the prefix part matches: + byte[] termBytes = term.Bytes; + int 
termBytesPos = currentFrame.prefix - lenInPrefix; + if (Debugging.AssertsEnabled) Debugging.Assert(termBytesPos >= 0); + int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) + { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) + { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch (in prefix)"); + // } + goto nextTermContinue; + } + } + suffixBytesPos = currentFrame.startBytePos; + } + else + { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.CommonSuffixRef.Length; + } + + // Test overlapping suffix part: + int commonSuffixBytesPosEnd = compiledAutomaton.CommonSuffixRef.Length; + while (commonSuffixBytesPos < commonSuffixBytesPosEnd) + { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) + { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch"); + // } + goto nextTermContinue; + } + } + } + + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + + // See if the term prefix matches the automaton: + int state = currentFrame.state; + for (int idx = 0; idx < currentFrame.suffix; idx++) + { + state = runAutomaton.Step(state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); + if (state == -1) + { + // No match + //System.out.println(" no s=" + state); + goto nextTermContinue; + } + else + { + //System.out.println(" c s=" + state); + } + } + + if (isSubBlock) + { + // Match! 
Recurse: + //if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP); + CopyTerm(); + currentFrame = PushFrame(state); + //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); + } + else if (runAutomaton.IsAccept(state)) + { + CopyTerm(); + //if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + brToString(term)); + if (Debugging.AssertsEnabled) Debugging.Assert(savedStartTerm == null || term.CompareTo(savedStartTerm) > 0, () => "saveStartTerm=" + savedStartTerm.Utf8ToString() + " term=" + term.Utf8ToString()); + return true; + } + else + { + //System.out.println(" no s=" + state); + } + nextTermContinue:; + } + //nextTermBreak:; + } + public override BytesRef Next() { // if (DEBUG) { @@ -2417,6 +2590,116 @@ namespace Lucene.Net.Codecs // } //} + + /* Decodes only the term bytes of the next term. If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. */ + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (@in == null) + { + // Fresh TermsEnum; seek to first term: + FST.Arc<BytesRef> arc; + if (outerInstance.index != null) + { + arc = outerInstance.index.GetFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! 
+ if (Debugging.AssertsEnabled) Debugging.Assert(arc.IsFinal); + } + else + { + arc = null; + } + currentFrame = PushFrame(arc, outerInstance.rootCode, 0); + currentFrame.LoadBlock(); + } + + targetBeforeCurrentLength = currentFrame.ord; + + if (Debugging.AssertsEnabled) Debugging.Assert(!eof); + //if (DEBUG) { + //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix); + //printSeekState(); + //} + + if (currentFrame == staticFrame) + { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term); + bool result = SeekExact(term); + if (Debugging.AssertsEnabled) Debugging.Assert(result); + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) + { + if (!currentFrame.isLastInFloor) + { + currentFrame.LoadNextFloorBlock(); + } + else + { + //if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) + { + //if (DEBUG) System.out.println(" return null"); + if (Debugging.AssertsEnabled) Debugging.Assert(SetEOF()); + term.Length = 0; + validIndexPrefix = 0; + currentFrame.Rewind(); + termExists = false; + return false; + } + long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + + if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) + { + // We popped into a frame that's not loaded + // yet or not scan'd to the right entry + currentFrame.ScanToFloorFrame(term); + currentFrame.LoadBlock(); + currentFrame.ScanToSubBlock(lastFP); + } + + // Note that the seek state (last seek) has been + // 
invalidated beyond this depth + validIndexPrefix = Math.Min(validIndexPrefix, currentFrame.prefix); + //if (DEBUG) { + //System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + //} + } + } + + while (true) + { + if (currentFrame.Next()) + { + // Push to new block: + //if (DEBUG) System.out.println(" push frame"); + currentFrame = PushFrame(null, currentFrame.lastSubFP, term.Length); + // this is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.LoadBlock(); + } + else + { + //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord); + return true; + } + } + } + + /* Decodes only the term bytes of the next term. If caller then asks for metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) decode all metadata up to the current term. */ diff --git a/src/Lucene.Net/Codecs/Compressing/CompressingTermVectorsReader.cs b/src/Lucene.Net/Codecs/Compressing/CompressingTermVectorsReader.cs index bce8eae..de4df91 100644 --- a/src/Lucene.Net/Codecs/Compressing/CompressingTermVectorsReader.cs +++ b/src/Lucene.Net/Codecs/Compressing/CompressingTermVectorsReader.cs @@ -835,6 +835,31 @@ namespace Lucene.Net.Codecs.Compressing ord = -1; } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (ord == numTerms - 1) + { + return false; + } + else + { + if (Debugging.AssertsEnabled) Debugging.Assert(ord < numTerms); + ++ord; + } + + // read term + term.Offset = 0; + term.Length = prefixLengths[ord] + suffixLengths[ord]; + if (term.Length > term.Bytes.Length) + { + term.Bytes = ArrayUtil.Grow(term.Bytes, term.Length); + } + @in.ReadBytes(term.Bytes, prefixLengths[ord], suffixLengths[ord]); + + return true; + } + public override BytesRef Next() { if (ord == numTerms - 1) 
diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs index b37a77d..8c05305 100644 --- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs +++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs @@ -959,6 +959,88 @@ namespace Lucene.Net.Codecs.Lucene3x } } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (DEBUG_SURROGATES) + { + Console.WriteLine("TE.next()"); + } + if (skipNext) + { + if (DEBUG_SURROGATES) + { + Console.WriteLine(" skipNext=true"); + } + skipNext = false; + if (termEnum.Term() == null) + { + return false; + // PreFlex codec interns field names: + } + else if (termEnum.Term().Field != internedFieldName) + { + return false; + } + else + { + current = termEnum.Term().Bytes; + return true; + } + } + + // TODO: can we use STE's prevBuffer here? + prevTerm.CopyBytes(termEnum.Term().Bytes); + + if (termEnum.Next() && termEnum.Term().Field == internedFieldName) + { + newSuffixStart = termEnum.newSuffixStart; + if (DEBUG_SURROGATES) + { + Console.WriteLine(" newSuffixStart=" + newSuffixStart); + } + SurrogateDance(); + Term t = termEnum.Term(); + if (t == null || t.Field != internedFieldName) + { + // PreFlex codec interns field names; verify: + if (Debugging.AssertsEnabled) Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal)); + current = null; + return false; + } + else + { + current = t.Bytes; + return true; + } + } + else + { + // this field is exhausted, but we have to give + // surrogateDance a chance to seek back: + if (DEBUG_SURROGATES) + { + Console.WriteLine(" force cont"); + } + //newSuffixStart = prevTerm.length; + newSuffixStart = 0; + SurrogateDance(); + + Term t = termEnum.Term(); + if (t == null || t.Field != internedFieldName) + { + // PreFlex codec interns field names; verify: + if (Debugging.AssertsEnabled) Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, 
StringComparison.Ordinal)); + return false; + } + else + { + current = t.Bytes; + return true; + } + } + } + public override BytesRef Next() { if (DEBUG_SURROGATES) diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xTermVectorsReader.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xTermVectorsReader.cs index 8f3d99b..f3e0b8a 100644 --- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xTermVectorsReader.cs +++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xTermVectorsReader.cs @@ -429,8 +429,6 @@ namespace Lucene.Net.Codecs.Lucene3x private class TVTermsEnum : TermsEnum { - private readonly Lucene3xTermVectorsReader outerInstance; - internal bool unicodeSortOrder; internal readonly IndexInput origTVF; internal readonly IndexInput tvf; @@ -444,7 +442,6 @@ namespace Lucene.Net.Codecs.Lucene3x // NOTE: tvf is pre-positioned by caller public TVTermsEnum(Lucene3xTermVectorsReader outerInstance) { - this.outerInstance = outerInstance; this.origTVF = outerInstance.tvf; tvf = (IndexInput)origTVF.Clone(); } @@ -549,6 +546,16 @@ namespace Lucene.Net.Codecs.Lucene3x throw new NotSupportedException(); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (++currentTerm >= numTerms) + { + return false; + } + return true; + } + public override BytesRef Next() { if (++currentTerm >= numTerms) diff --git a/src/Lucene.Net/Codecs/Lucene40/Lucene40TermVectorsReader.cs b/src/Lucene.Net/Codecs/Lucene40/Lucene40TermVectorsReader.cs index 1b1a970..474bc8e 100644 --- a/src/Lucene.Net/Codecs/Lucene40/Lucene40TermVectorsReader.cs +++ b/src/Lucene.Net/Codecs/Lucene40/Lucene40TermVectorsReader.cs @@ -397,8 +397,6 @@ namespace Lucene.Net.Codecs.Lucene40 private class TVTermsEnum : TermsEnum { - private readonly Lucene40TermVectorsReader outerInstance; - private readonly IndexInput origTVF; private readonly IndexInput tvf; private int numTerms; @@ -424,7 +422,6 @@ namespace Lucene.Net.Codecs.Lucene40 // NOTE: tvf is pre-positioned by caller public 
TVTermsEnum(Lucene40TermVectorsReader outerInstance) { - this.outerInstance = outerInstance; this.origTVF = outerInstance.tvf; tvf = (IndexInput)origTVF.Clone(); } @@ -489,6 +486,75 @@ namespace Lucene.Net.Codecs.Lucene40 throw new NotSupportedException(); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (nextTerm >= numTerms) + { + return false; + } + term.CopyBytes(lastTerm); + int start = tvf.ReadVInt32(); + int deltaLen = tvf.ReadVInt32(); + term.Length = start + deltaLen; + term.Grow(term.Length); + tvf.ReadBytes(term.Bytes, start, deltaLen); + freq = tvf.ReadVInt32(); + + if (storePayloads) + { + positions = new int[freq]; + payloadOffsets = new int[freq]; + int totalPayloadLength = 0; + int pos = 0; + for (int posUpto = 0; posUpto < freq; posUpto++) + { + int code = tvf.ReadVInt32(); + pos += (int)((uint)code >> 1); + positions[posUpto] = pos; + if ((code & 1) != 0) + { + // length change + lastPayloadLength = tvf.ReadVInt32(); + } + payloadOffsets[posUpto] = totalPayloadLength; + totalPayloadLength += lastPayloadLength; + if (Debugging.AssertsEnabled) Debugging.Assert(totalPayloadLength >= 0); + } + payloadData = new byte[totalPayloadLength]; + tvf.ReadBytes(payloadData, 0, payloadData.Length); + } // no payloads + else if (storePositions) + { + // TODO: we could maybe reuse last array, if we can + // somehow be careful about consumer never using two + // D&PEnums at once... 
+ positions = new int[freq]; + int pos = 0; + for (int posUpto = 0; posUpto < freq; posUpto++) + { + pos += tvf.ReadVInt32(); + positions[posUpto] = pos; + } + } + + if (storeOffsets) + { + startOffsets = new int[freq]; + endOffsets = new int[freq]; + int offset = 0; + for (int posUpto = 0; posUpto < freq; posUpto++) + { + startOffsets[posUpto] = offset + tvf.ReadVInt32(); + offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.ReadVInt32(); + } + } + + lastTerm.CopyBytes(term); + nextTerm++; + return true; + } + public override BytesRef Next() { if (nextTerm >= numTerms) diff --git a/src/Lucene.Net/Codecs/Lucene42/Lucene42DocValuesProducer.cs b/src/Lucene.Net/Codecs/Lucene42/Lucene42DocValuesProducer.cs index b3c91e5..b675f1f 100644 --- a/src/Lucene.Net/Codecs/Lucene42/Lucene42DocValuesProducer.cs +++ b/src/Lucene.Net/Codecs/Lucene42/Lucene42DocValuesProducer.cs @@ -722,6 +722,9 @@ namespace Lucene.Net.Codecs.Lucene42 bytesReader = fst.GetBytesReader(); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() => @in.MoveNext(); + public override BytesRef Next() { var io = @in.Next(); diff --git a/src/Lucene.Net/Codecs/Lucene45/Lucene45DocValuesProducer.cs b/src/Lucene.Net/Codecs/Lucene45/Lucene45DocValuesProducer.cs index faaea83..9957b2c 100644 --- a/src/Lucene.Net/Codecs/Lucene45/Lucene45DocValuesProducer.cs +++ b/src/Lucene.Net/Codecs/Lucene45/Lucene45DocValuesProducer.cs @@ -1074,7 +1074,7 @@ namespace Lucene.Net.Codecs.Lucene45 { private readonly CompressedBinaryDocValues outerInstance; - private IndexInput input; + private readonly IndexInput input; public TermsEnumAnonymousInnerClassHelper(CompressedBinaryDocValues outerInstance, IndexInput input) { @@ -1092,6 +1092,24 @@ namespace Lucene.Net.Codecs.Lucene45 private readonly BytesRef term; + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (++currentOrd >= 
outerInstance.numValues) + { + return false; + } + else + { + int start = input.ReadVInt32(); + int suffix = input.ReadVInt32(); + input.ReadBytes(termBuffer.Bytes, start, suffix); + termBuffer.Length = start + suffix; + SetTerm(); + return true; + } + } + public override BytesRef Next() { if (DoNext() == null) diff --git a/src/Lucene.Net/Index/DocTermOrds.cs b/src/Lucene.Net/Index/DocTermOrds.cs index c49bbc4..1c4fe77 100644 --- a/src/Lucene.Net/Index/DocTermOrds.cs +++ b/src/Lucene.Net/Index/DocTermOrds.cs @@ -770,6 +770,22 @@ namespace Lucene.Net.Index public override BytesRef Term => term; + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (++ord < 0) + { + ord = 0; + } + if (!termsEnum.MoveNext()) + { + term = null; + return false; + } + SetTerm(); // this is extra work if we know we are in bounds... + return true; + } + public override BytesRef Next() { if (++ord < 0) diff --git a/src/Lucene.Net/Index/FilterAtomicReader.cs b/src/Lucene.Net/Index/FilterAtomicReader.cs index b7c2925..8f5bdca 100644 --- a/src/Lucene.Net/Index/FilterAtomicReader.cs +++ b/src/Lucene.Net/Index/FilterAtomicReader.cs @@ -161,6 +161,12 @@ namespace Lucene.Net.Index m_input.SeekExact(ord); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + return m_input.MoveNext(); + } + public override BytesRef Next() { return m_input.Next(); diff --git a/src/Lucene.Net/Index/FilteredTermsEnum.cs b/src/Lucene.Net/Index/FilteredTermsEnum.cs index 7a5b380..e885588 100644 --- a/src/Lucene.Net/Index/FilteredTermsEnum.cs +++ b/src/Lucene.Net/Index/FilteredTermsEnum.cs @@ -117,7 +117,7 @@ namespace Lucene.Net.Index } /// <summary> - /// On the first call to <see cref="Next()"/> or if <see cref="Accept(BytesRef)"/> returns + /// On the first call to <see cref="MoveNext()"/> or if <see cref="Accept(BytesRef)"/> returns /// <see cref="AcceptStatus.YES_AND_SEEK"/> or <see 
cref="AcceptStatus.NO_AND_SEEK"/>, /// this method will be called to eventually seek the underlying <see cref="TermsEnum"/> /// to a new position. @@ -211,6 +211,63 @@ namespace Lucene.Net.Index return tenum.GetTermState(); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + //System.out.println("FTE.next doSeek=" + doSeek); + //new Throwable().printStackTrace(System.out); + for (; ; ) + { + // Seek or forward the iterator + if (doSeek) + { + doSeek = false; + BytesRef t = NextSeekTerm(actualTerm); + //System.out.println(" seek to t=" + (t == null ? "null" : t.utf8ToString()) + " tenum=" + tenum); + // Make sure we always seek forward: + if (Debugging.AssertsEnabled) Debugging.Assert(actualTerm == null || t == null || Comparer.Compare(t, actualTerm) > 0, () => "curTerm=" + actualTerm + " seekTerm=" + t); + if (t == null || tenum.SeekCeil(t) == SeekStatus.END) + { + // no more terms to seek to or enum exhausted + //System.out.println(" return null"); + return false; + } + actualTerm = tenum.Term; + //System.out.println(" got term=" + actualTerm.utf8ToString()); + } + else + { + actualTerm = tenum.Next(); + if (actualTerm == null) + { + // enum exhausted + return false; + } + } + + // check if term is accepted + switch (Accept(actualTerm)) + { + case FilteredTermsEnum.AcceptStatus.YES_AND_SEEK: + doSeek = true; + // term accepted, but we need to seek so fall-through + goto case FilteredTermsEnum.AcceptStatus.YES; + case FilteredTermsEnum.AcceptStatus.YES: + // term accepted + return true; + + case FilteredTermsEnum.AcceptStatus.NO_AND_SEEK: + // invalid term, seek next time + doSeek = true; + break; + + case FilteredTermsEnum.AcceptStatus.END: + // we are supposed to end the enum + return false; + } + } + } + public override BytesRef Next() { //System.out.println("FTE.next doSeek=" + doSeek); diff --git a/src/Lucene.Net/Index/MultiTermsEnum.cs b/src/Lucene.Net/Index/MultiTermsEnum.cs index 
afe5955..2981360 100644 --- a/src/Lucene.Net/Index/MultiTermsEnum.cs +++ b/src/Lucene.Net/Index/MultiTermsEnum.cs @@ -356,6 +356,39 @@ namespace Lucene.Net.Index numTop = 0; } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (lastSeekExact) + { + // Must SeekCeil at this point, so those subs that + // didn't have the term can find the following term. + // NOTE: we could save some CPU by only SeekCeil the + // subs that didn't match the last exact seek... but + // most impls short-circuit if you SeekCeil to term + // they are already on. + SeekStatus status = SeekCeil(current); + if (Debugging.AssertsEnabled) Debugging.Assert(status == SeekStatus.FOUND); + lastSeekExact = false; + } + lastSeek = null; + + // restore queue + PushTop(); + + // gather equal top fields + if (queue.Count > 0) + { + PullTop(); + return !(current is null); + } + else + { + current = null; + return false; + } + } + public override BytesRef Next() { if (lastSeekExact) diff --git a/src/Lucene.Net/Index/SortedDocValuesTermsEnum.cs b/src/Lucene.Net/Index/SortedDocValuesTermsEnum.cs index 9bc7742..16919be 100644 --- a/src/Lucene.Net/Index/SortedDocValuesTermsEnum.cs +++ b/src/Lucene.Net/Index/SortedDocValuesTermsEnum.cs @@ -99,6 +99,18 @@ namespace Lucene.Net.Index values.LookupOrd(currentOrd, term); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + currentOrd++; + if (currentOrd >= values.ValueCount) + { + return false; + } + values.LookupOrd(currentOrd, term); + return true; + } + public override BytesRef Next() { currentOrd++; diff --git a/src/Lucene.Net/Index/SortedSetDocValuesTermsEnum.cs b/src/Lucene.Net/Index/SortedSetDocValuesTermsEnum.cs index baef17a..337f055 100644 --- a/src/Lucene.Net/Index/SortedSetDocValuesTermsEnum.cs +++ b/src/Lucene.Net/Index/SortedSetDocValuesTermsEnum.cs @@ -99,6 +99,18 @@ namespace Lucene.Net.Index 
values.LookupOrd(currentOrd, term); } + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + currentOrd++; + if (currentOrd >= values.ValueCount) + { + return false; + } + values.LookupOrd(currentOrd, term); + return true; + } + public override BytesRef Next() { currentOrd++; diff --git a/src/Lucene.Net/Index/Terms.cs b/src/Lucene.Net/Index/Terms.cs index 39f5450..81187bf 100644 --- a/src/Lucene.Net/Index/Terms.cs +++ b/src/Lucene.Net/Index/Terms.cs @@ -72,7 +72,7 @@ namespace Lucene.Net.Index /// <see cref="CompiledAutomaton"/>. If the <paramref name="startTerm"/> is /// provided then the returned enum will only accept terms /// > <paramref name="startTerm"/>, but you still must call - /// <see cref="TermsEnum.Next()"/> first to get to the first term. Note that the + /// <see cref="TermsEnum.MoveNext()"/> first to get to the first term. Note that the /// provided <paramref name="startTerm"/> must be accepted by /// the automaton. 
/// diff --git a/src/Lucene.Net/Index/TermsEnum.cs b/src/Lucene.Net/Index/TermsEnum.cs index 58f492f..fa42619 100644 --- a/src/Lucene.Net/Index/TermsEnum.cs +++ b/src/Lucene.Net/Index/TermsEnum.cs @@ -1,6 +1,7 @@ using Lucene.Net.Util; using System; using System.Collections.Generic; +using System.ComponentModel; namespace Lucene.Net.Index { @@ -45,15 +46,14 @@ namespace Lucene.Net.Index /// </summary> public abstract class TermsEnum : IBytesRefEnumerator, IBytesRefIterator // LUCENENET specific: Implemented IBytesRefEnumerator for .NET compatibility { - protected BytesRef m_current; - public abstract IComparer<BytesRef> Comparer { get; } // LUCENENET specific - must supply implementation for the interface /// <inheritdoc/> public abstract BytesRef Next(); // LUCENENET specific - must supply implementation for the interface /// <inheritdoc/> - public virtual BytesRef Current => m_current; // LUCENENET specific - made into enumerator for foreach + [EditorBrowsable(EditorBrowsableState.Never)] + public BytesRef Current => Term; // LUCENENET specific - made into enumerator for foreach /// <summary> /// Moves to the next item in the <see cref="TermsEnum"/>. 
@@ -62,11 +62,7 @@ namespace Lucene.Net.Index /// </summary> /// <returns><c>true</c> if the enumerator was successfully advanced to the next element; /// <c>false</c> if the enumerator has passed the end of the collection.</returns> - public virtual bool MoveNext() // LUCENENET specific - made into enumerator for foreach - { - m_current = Next(); - return !(m_current is null); - } + public abstract bool MoveNext(); // LUCENENET specific - made into enumerator for foreach private AttributeSource atts = null; @@ -295,10 +291,6 @@ namespace Lucene.Net.Index private class TermsEnumAnonymousInnerClassHelper : TermsEnum { - public TermsEnumAnonymousInnerClassHelper() - { - } - public override SeekStatus SeekCeil(BytesRef term) { return SeekStatus.END; @@ -328,6 +320,12 @@ namespace Lucene.Net.Index throw new InvalidOperationException("this method should never be called"); } + // LUCENENET specific + public override bool MoveNext() + { + return false; + } + public override BytesRef Next() { return null; diff --git a/src/Lucene.Net/Search/FuzzyTermsEnum.cs b/src/Lucene.Net/Search/FuzzyTermsEnum.cs index 14039c3..77e1fd2 100644 --- a/src/Lucene.Net/Search/FuzzyTermsEnum.cs +++ b/src/Lucene.Net/Search/FuzzyTermsEnum.cs @@ -273,6 +273,33 @@ namespace Lucene.Net.Search private BytesRef queuedBottom = null; + // LUCENENET specific - duplicate logic for better enumerator optimization + public override bool MoveNext() + { + if (queuedBottom != null) + { + BottomChanged(queuedBottom, false); + queuedBottom = null; + } + + BytesRef term = actualEnum.Next(); + boostAtt.Boost = actualBoostAtt.Boost; + + float bottom = maxBoostAtt.MaxNonCompetitiveBoost; + BytesRef bottomTerm = maxBoostAtt.CompetitiveTerm; + if (actualEnum.MoveNext() && (bottom != this.bottom || bottomTerm != this.bottomTerm)) + { + this.bottom = bottom; + this.bottomTerm = bottomTerm; + // clone the term before potentially doing something with it + // this is a rare but wonderful occurrence anyway + queuedBottom = 
BytesRef.DeepCopyOf(actualEnum.Term); + return true; + } + + return false; + } + public override BytesRef Next() { if (queuedBottom != null) diff --git a/src/Lucene.Net/Util/Fst/BytesRefFSTEnum.cs b/src/Lucene.Net/Util/Fst/BytesRefFSTEnum.cs index d28ba82..6dc30d5 100644 --- a/src/Lucene.Net/Util/Fst/BytesRefFSTEnum.cs +++ b/src/Lucene.Net/Util/Fst/BytesRefFSTEnum.cs @@ -47,6 +47,24 @@ namespace Lucene.Net.Util.Fst public BytesRefFSTEnum.InputOutput<T> Current => result; + // LUCENENET specific - duplicate logic for better enumerator optimization + public bool MoveNext() + { + //System.out.println(" enum.next"); + DoNext(); + + if (m_upto == 0) + { + return false; + } + else + { + current.Length = m_upto - 1; + result.Output = m_output[m_upto]; + return true; + } + } + public BytesRefFSTEnum.InputOutput<T> Next() { //System.out.println(" enum.next");
