Lucene.Net.Codecs.BlockTerms (FixedGapTermsIndexWriter + VariableGapTermsIndexWriter + VariableGapTermsIndexReader): Reviewed line-by-line and fixed several formatting issues
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/af9535fa Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/af9535fa Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/af9535fa Branch: refs/heads/api-work Commit: af9535fa26dfd0c514c30764b7c1d821a94300d2 Parents: 1cf1b11 Author: Shad Storhaug <[email protected]> Authored: Sat Mar 18 00:28:28 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Sat Mar 18 03:21:48 2017 +0700 ---------------------------------------------------------------------- .../BlockTerms/FixedGapTermsIndexWriter.cs | 2 + .../BlockTerms/VariableGapTermsIndexReader.cs | 109 +++++++++++------- .../BlockTerms/VariableGapTermsIndexWriter.cs | 115 +++++++++++-------- 3 files changed, 137 insertions(+), 89 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/af9535fa/src/Lucene.Net.Codecs/BlockTerms/FixedGapTermsIndexWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Codecs/BlockTerms/FixedGapTermsIndexWriter.cs b/src/Lucene.Net.Codecs/BlockTerms/FixedGapTermsIndexWriter.cs index c1731da..1751f83 100644 --- a/src/Lucene.Net.Codecs/BlockTerms/FixedGapTermsIndexWriter.cs +++ b/src/Lucene.Net.Codecs/BlockTerms/FixedGapTermsIndexWriter.cs @@ -136,6 +136,7 @@ namespace Lucene.Net.Codecs.BlockTerms internal SimpleFieldWriter(FixedGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer) { this.outerInstance = outerInstance; + this.fieldInfo = fieldInfo; indexStart = outerInstance.m_output.FilePointer; termsStart = lastTermsPointer = termsFilePointer; @@ -164,6 +165,7 @@ namespace Lucene.Net.Codecs.BlockTerms public override void Add(BytesRef text, TermStats stats, long termsFilePointer) { int indexedTermLength = outerInstance.IndexedTermPrefixLength(lastTerm, text); + //System.out.println("FGW: add text=" + text.utf8ToString() + " " + text + " fp=" + termsFilePointer); // write only the min prefix that shows the diff // against prior term http://git-wip-us.apache.org/repos/asf/lucenenet/blob/af9535fa/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs index 321bf1c..340b18e 100644 --- a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs +++ b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs @@ -56,8 +56,7 @@ namespace Lucene.Net.Codecs.BlockTerms IndexFileNames.SegmentFileName(segment, segmentSuffix, VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION), new IOContext(context, true)); this.segment = segment; - var success = false; - + bool success = false; Debug.Assert(indexDivisor == -1 || indexDivisor > 0); try @@ -66,12 +65,14 @@ namespace Lucene.Net.Codecs.BlockTerms _indexDivisor = indexDivisor; if (_version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM) + { CodecUtil.ChecksumEntireFile(_input); + } SeekDir(_input, _dirOffset); // Read directory - var numFields = _input.ReadVInt32(); + int numFields = _input.ReadVInt32(); if (numFields < 0) { throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + _input + ")"); @@ -79,9 +80,9 @@ namespace Lucene.Net.Codecs.BlockTerms for (var i = 0; i < numFields; i++) { - var field = _input.ReadVInt32(); - var indexStart = _input.ReadVInt64(); - var fieldInfo = fieldInfos.FieldInfo(field); + int field = _input.ReadVInt32(); + long indexStart = _input.ReadVInt64(); + FieldInfo fieldInfo = fieldInfos.FieldInfo(field); FieldIndexData previous = _fields.Put(fieldInfo, new FieldIndexData(this, fieldInfo, indexStart)); if (previous != null) { @@ -137,9 +138,11 @@ namespace Lucene.Net.Codecs.BlockTerms public override long Seek(BytesRef target) { + //System.out.println("VGR: seek field=" + fieldInfo.name + " target=" + target); _current = _fstEnum.SeekFloor(target); if (_current.Output.HasValue) { + //System.out.println(" got input=" + current.input + " output=" + current.output); return _current.Output.Value; } throw new NullReferenceException("_current.Output is null"); // LUCENENET NOTE: NullReferenceException would be thrown in Java, so doing it here @@ -147,11 +150,19 @@ namespace Lucene.Net.Codecs.BlockTerms public override long Next() { + //System.out.println("VGR: next field=" + fieldInfo.name); _current = _fstEnum.Next(); if (_current == null) + { + //System.out.println(" eof"); return -1; + } - return _current.Output.Value; + if (_current.Output.HasValue) + { + return _current.Output.Value; + } + throw new NullReferenceException("_current.Output is null"); // LUCENENET NOTE: NullReferenceException would be thrown in Java, so doing it here } public override long Ord @@ -181,49 +192,52 @@ namespace Lucene.Net.Codecs.BlockTerms public FieldIndexData(VariableGapTermsIndexReader outerInstance, FieldInfo fieldInfo, long indexStart) { this.outerInstance = outerInstance; + _indexStart = indexStart; if (this.outerInstance._indexDivisor > 0) + { LoadTermsIndex(); + } } private void LoadTermsIndex() { - if (fst != null) return; - - var clone = (IndexInput)outerInstance._input.Clone(); - clone.Seek(_indexStart); - fst = new FST<long?>(clone, outerInstance._fstOutputs); - clone.Dispose(); - - /* - final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; - Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); - Util.toDot(fst, w, false, false); - System.out.println("FST INDEX: SAVED to " + dotFileName); - w.close(); - */ - - if (outerInstance._indexDivisor > 1) + if (fst == null) { - // subsample - var scratchIntsRef = new Int32sRef(); - var outputs = PositiveInt32Outputs.Singleton; - var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); - var fstEnum = new BytesRefFSTEnum<long?>(fst); - var count = outerInstance._indexDivisor; - - BytesRefFSTEnum.InputOutput<long?> result; - while ((result = fstEnum.Next()) != null) + IndexInput clone = (IndexInput)outerInstance._input.Clone(); + clone.Seek(_indexStart); + fst = new FST<long?>(clone, outerInstance._fstOutputs); + clone.Dispose(); // LUCENENET TODO: No using block here is bad... + + /* + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(fst, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + */ + + if (outerInstance._indexDivisor > 1) { - if (count == outerInstance._indexDivisor) + // subsample + Int32sRef scratchIntsRef = new Int32sRef(); + PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; + Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); + BytesRefFSTEnum<long?> fstEnum = new BytesRefFSTEnum<long?>(fst); + BytesRefFSTEnum.InputOutput<long?> result; + int count = outerInstance._indexDivisor; + while ((result = fstEnum.Next()) != null) { - builder.Add(Util.Fst.Util.ToInt32sRef(result.Input, scratchIntsRef), result.Output); - count = 0; + if (count == outerInstance._indexDivisor) + { + builder.Add(Util.Fst.Util.ToInt32sRef(result.Input, scratchIntsRef), result.Output); + count = 0; + } + count++; } - count++; + fst = builder.Finish(); } - fst = builder.Finish(); } } @@ -236,13 +250,21 @@ namespace Lucene.Net.Codecs.BlockTerms public override FieldIndexEnum GetFieldEnum(FieldInfo fieldInfo) { - FieldIndexData fieldData = _fields[fieldInfo]; - return fieldData.fst == null ? null : new IndexEnum(fieldData.fst); + FieldIndexData fieldData; + if (!_fields.TryGetValue(fieldInfo, out fieldData) || fieldData == null) + { + return null; + } + else + { + return new IndexEnum(fieldData.fst); + } } public override void Dispose() { - if (_input != null && !_indexLoaded) { + if (_input != null && !_indexLoaded) + { _input.Dispose(); } } @@ -264,7 +286,12 @@ namespace Lucene.Net.Codecs.BlockTerms public override long RamBytesUsed() { - return _fields.Values.Sum(entry => entry.RamBytesUsed()); + long sizeInBytes = 0; + foreach (FieldIndexData entry in _fields.Values) + { + sizeInBytes += entry.RamBytesUsed(); + } + return sizeInBytes; } } } http://git-wip-us.apache.org/repos/asf/lucenenet/blob/af9535fa/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs index fae7620..f545666 100644 --- a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs +++ b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs @@ -38,16 +38,18 @@ namespace Lucene.Net.Codecs.BlockTerms { protected IndexOutput m_output; - /** Extension of terms index file */ + /// <summary>Extension of terms index file</summary> internal const string TERMS_INDEX_EXTENSION = "tiv"; + internal const string CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX"; internal const int VERSION_START = 0; internal const int VERSION_APPEND_ONLY = 1; internal const int VERSION_CHECKSUM = 2; internal const int VERSION_CURRENT = VERSION_CHECKSUM; - private readonly List<FstFieldWriter> _fields = new List<FstFieldWriter>(); + private readonly IList<FstFieldWriter> _fields = new List<FstFieldWriter>(); + private readonly FieldInfos fieldInfos; // unread private readonly IndexTermSelector _policy; /// <summary> @@ -81,19 +83,22 @@ namespace Lucene.Net.Codecs.BlockTerms public EveryNTermSelector(int interval) { this._interval = interval; - _count = interval; // First term is first indexed term + // First term is first indexed term + _count = interval; } public override bool IsIndexTerm(BytesRef term, TermStats stats) { - if (_count >= _interval) + if (_count >= _interval) { _count = 1; return true; } - - _count++; - return false; + else + { + _count++; + return false; + } } public override void NewField(FieldInfo fieldInfo) @@ -115,9 +120,11 @@ namespace Lucene.Net.Codecs.BlockTerms public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) { - _interval = interval; - _docFreqThresh = docFreqThresh; - _count = interval; // First term is first indexed term + this._interval = interval; + this._docFreqThresh = docFreqThresh; + + // First term is first indexed term + _count = interval; } public override bool IsIndexTerm(BytesRef term, TermStats stats) @@ -127,9 +134,11 @@ namespace Lucene.Net.Codecs.BlockTerms _count = 1; return true; } - - _count++; - return false; + else + { + _count++; + return false; + } } public override void NewField(FieldInfo fieldInfo) @@ -177,17 +186,19 @@ namespace Lucene.Net.Codecs.BlockTerms TERMS_INDEX_EXTENSION); m_output = state.Directory.CreateOutput(indexFileName, state.Context); bool success = false; - try { - _policy = policy; + fieldInfos = state.FieldInfos; + this._policy = policy; WriteHeader(m_output); success = true; } finally { if (!success) + { IOUtils.CloseWhileHandlingException(m_output); + } } } @@ -198,8 +209,9 @@ namespace Lucene.Net.Codecs.BlockTerms public override FieldWriter AddField(FieldInfo field, long termsFilePointer) { + ////System.out.println("VGW: field=" + field.name); _policy.NewField(field); - var writer = new FstFieldWriter(this, field, termsFilePointer); + FstFieldWriter writer = new FstFieldWriter(this, field, termsFilePointer); _fields.Add(writer); return writer; } @@ -233,7 +245,7 @@ namespace Lucene.Net.Codecs.BlockTerms private readonly VariableGapTermsIndexWriter outerInstance; private readonly Builder<long?> _fstBuilder; - //private readonly PositiveIntOutputs fstOutputs; // LUCENENET NOTE: Not used + private readonly PositiveInt32Outputs fstOutputs; private readonly long _startTermsFilePointer; internal FieldInfo FieldInfo { get; private set; } @@ -246,10 +258,12 @@ namespace Lucene.Net.Codecs.BlockTerms public FstFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer) { this.outerInstance = outerInstance; - FieldInfo = fieldInfo; - PositiveInt32Outputs fstOutputs = PositiveInt32Outputs.Singleton; + + this.FieldInfo = fieldInfo; + fstOutputs = PositiveInt32Outputs.Singleton; _fstBuilder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, fstOutputs); IndexStart = this.outerInstance.m_output.FilePointer; + ////System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in _fstBuilder.Add(new Int32sRef(), termsFilePointer); @@ -265,9 +279,11 @@ namespace Lucene.Net.Codecs.BlockTerms _first = false; return true; } - - _lastTerm.CopyBytes(text); - return false; + else + { + _lastTerm.CopyBytes(text); + return false; + } } private readonly Int32sRef _scratchIntsRef = new Int32sRef(); @@ -297,46 +313,49 @@ namespace Lucene.Net.Codecs.BlockTerms { Fst = _fstBuilder.Finish(); if (Fst != null) + { Fst.Save(outerInstance.m_output); + } } } public override void Dispose() { - if (m_output == null) return; - - try + if (m_output != null) { - long dirStart = m_output.FilePointer; - int fieldCount = _fields.Count; - - int nonNullFieldCount = 0; - for (int i = 0; i < fieldCount; i++) + try { - FstFieldWriter field = _fields[i]; - if (field.Fst != null) + long dirStart = m_output.FilePointer; + int fieldCount = _fields.Count; + + int nonNullFieldCount = 0; + for (int i = 0; i < fieldCount; i++) { - nonNullFieldCount++; + FstFieldWriter field = _fields[i]; + if (field.Fst != null) + { + nonNullFieldCount++; + } } - } - m_output.WriteVInt32(nonNullFieldCount); - for (int i = 0; i < fieldCount; i++) - { - FstFieldWriter field = _fields[i]; - if (field.Fst != null) + m_output.WriteVInt32(nonNullFieldCount); + for (int i = 0; i < fieldCount; i++) { - m_output.WriteVInt32(field.FieldInfo.Number); - m_output.WriteVInt64(field.IndexStart); + FstFieldWriter field = _fields[i]; + if (field.Fst != null) + { + m_output.WriteVInt32(field.FieldInfo.Number); + m_output.WriteVInt64(field.IndexStart); + } } + WriteTrailer(dirStart); + CodecUtil.WriteFooter(m_output); + } + finally + { + m_output.Dispose(); + m_output = null; } - WriteTrailer(dirStart); - CodecUtil.WriteFooter(m_output); - } - finally - { - m_output.Dispose(); - m_output = null; } }
