http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTermVectorsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/Index/TestTermVectorsReader.cs b/src/Lucene.Net.Tests/Index/TestTermVectorsReader.cs
new file mode 100644
index 0000000..8426151
--- /dev/null
+++ b/src/Lucene.Net.Tests/Index/TestTermVectorsReader.cs
@@ -0,0 +1,477 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using System;
+using Lucene.Net.Documents;
+
+namespace Lucene.Net.Index
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    using Lucene.Net.Analysis;
+    using NUnit.Framework;
+    using System.IO;
+    using BytesRef = Lucene.Net.Util.BytesRef;
+    using Codec = Lucene.Net.Codecs.Codec;
+    using Directory = Lucene.Net.Store.Directory;
+    using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
+    using Document = Documents.Document;
+    using Field = Field;
+    using FieldType = FieldType;
+    using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
+    using TermVectorsReader = Lucene.Net.Codecs.TermVectorsReader;
+    using TestUtil = Lucene.Net.Util.TestUtil;
+    using TextField = TextField;
+
+    [TestFixture]
+    public class TestTermVectorsReader : LuceneTestCase
+    {
+        public TestTermVectorsReader()
+        {
+            InitializeInstanceFields();
+        }
+
+        private void InitializeInstanceFields()
+        {
+            Positions = new int[TestTerms.Length][];
+            Tokens = new TestToken[TestTerms.Length * TERM_FREQ];
+        }
+
+        //Must be lexicographically sorted, will do in setup, versus trying to maintain here
+        private string[] TestFields = new string[] { "f1", "f2", "f3", "f4" };
+
+        private bool[] TestFieldsStorePos = new bool[] { true, false, true, false };
+        private bool[] TestFieldsStoreOff = new bool[] { true, false, false, true };
+        private string[] TestTerms = new string[] { "this", "is", "a", "test" };
+        private int[][] Positions;
+        private Directory Dir;
+        private SegmentCommitInfo Seg;
+        private FieldInfos FieldInfos = new FieldInfos(new FieldInfo[0]);
+        private static int TERM_FREQ = 3;
+
+        internal class TestToken : IComparable<TestToken>
+        {
+            private readonly TestTermVectorsReader OuterInstance;
+
+            public TestToken(TestTermVectorsReader outerInstance)
+            {
+                this.OuterInstance = outerInstance;
+            }
+
+            internal string Text;
+            internal int Pos;
+            internal int StartOffset;
+            internal int EndOffset;
+
+            public virtual int CompareTo(TestToken other)
+            {
+                return Pos - other.Pos;
+            }
+        }
+
+        internal TestToken[] Tokens;
+
+        [SetUp]
+        public override void SetUp()
+        {
+            base.SetUp();
+            /*
+            for (int i = 0; i < testFields.Length; i++) {
+              fieldInfos.Add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
+            }
+            */
+
+            Array.Sort(TestTerms);
+            int tokenUpto = 0;
+            for (int i = 0; i < TestTerms.Length; i++)
+            {
+                Positions[i] = new int[TERM_FREQ];
+                // first position must be 0
+                for (int j = 0; j < TERM_FREQ; j++)
+                {
+                    // positions are always sorted in increasing order
+                    Positions[i][j] = (int)(j * 10 + new Random(1).NextDouble() * 10);
+                    TestToken token = Tokens[tokenUpto++] = new TestToken(this);
+                    token.Text = TestTerms[i];
+                    token.Pos = Positions[i][j];
+                    token.StartOffset = j * 10;
+                    token.EndOffset = j * 10 + TestTerms[i].Length;
+                }
+            }
+            Array.Sort(Tokens);
+
+            Dir = NewDirectory();
+            IndexWriter writer = new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MyAnalyzer(this)).SetMaxBufferedDocs(-1).SetMergePolicy(NewLogMergePolicy(false, 10)).SetUseCompoundFile(false));
+
+            Document doc = new Document();
+            for (int i = 0; i < TestFields.Length; i++)
+            {
+                FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+                if (TestFieldsStorePos[i] && TestFieldsStoreOff[i])
+                {
+                    customType.StoreTermVectors = true;
+                    customType.StoreTermVectorPositions = true;
+                    customType.StoreTermVectorOffsets = true;
+                }
+                else if (TestFieldsStorePos[i] && !TestFieldsStoreOff[i])
+                {
+                    customType.StoreTermVectors = true;
+                    customType.StoreTermVectorPositions = true;
+                }
+                else if (!TestFieldsStorePos[i] && TestFieldsStoreOff[i])
+                {
+                    customType.StoreTermVectors = true;
+                    customType.StoreTermVectorOffsets = true;
+                }
+                else
+                {
+                    customType.StoreTermVectors = true;
+                }
+                doc.Add(new Field(TestFields[i], "", customType));
+            }
+
+            //Create 5 documents for testing, they all have the same
+            //terms
+            for (int j = 0; j < 5; j++)
+            {
+                writer.AddDocument(doc);
+            }
+            writer.Commit();
+            Seg = writer.NewestSegment();
+            writer.Dispose();
+
+            FieldInfos = SegmentReader.ReadFieldInfos(Seg);
+        }
+
+        [TearDown]
+        public override void TearDown()
+        {
+            Dir.Dispose();
+            base.TearDown();
+        }
+
+        private class MyTokenizer : Tokenizer
+        {
+            private readonly TestTermVectorsReader OuterInstance;
+
+            internal int TokenUpto;
+
+            internal readonly ICharTermAttribute TermAtt;
+            internal readonly IPositionIncrementAttribute PosIncrAtt;
+            internal readonly IOffsetAttribute OffsetAtt;
+
+            public MyTokenizer(TestTermVectorsReader outerInstance, TextReader reader)
+                : base(reader)
+            {
+                this.OuterInstance = outerInstance;
+                TermAtt = AddAttribute<ICharTermAttribute>();
+                PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+                OffsetAtt = AddAttribute<IOffsetAttribute>();
+            }
+
+            public sealed override bool IncrementToken()
+            {
+                if (TokenUpto >= OuterInstance.Tokens.Length)
+                {
+                    return false;
+                }
+                else
+                {
+                    TestToken testToken = OuterInstance.Tokens[TokenUpto++];
+                    ClearAttributes();
+                    TermAtt.Append(testToken.Text);
+                    OffsetAtt.SetOffset(testToken.StartOffset, testToken.EndOffset);
+                    if (TokenUpto > 1)
+                    {
+                        PosIncrAtt.PositionIncrement = testToken.Pos - OuterInstance.Tokens[TokenUpto - 2].Pos;
+                    }
+                    else
+                    {
+                        PosIncrAtt.PositionIncrement = testToken.Pos + 1;
+                    }
+                    return true;
+                }
+            }
+
+            public override void Reset()
+            {
+                base.Reset();
+                this.TokenUpto = 0;
+            }
+        }
+
+        private class MyAnalyzer : Analyzer
+        {
+            private readonly TestTermVectorsReader OuterInstance;
+
+            public MyAnalyzer(TestTermVectorsReader outerInstance)
+            {
+                this.OuterInstance = outerInstance;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                return new TokenStreamComponents(new MyTokenizer(OuterInstance, reader));
+            }
+        }
+
+        [Test]
+        public virtual void Test()
+        {
+            //Check to see the files were created properly in setup
+            DirectoryReader reader = DirectoryReader.Open(Dir);
+            foreach (AtomicReaderContext ctx in reader.Leaves)
+            {
+                SegmentReader sr = (SegmentReader)ctx.Reader;
+                Assert.IsTrue(sr.FieldInfos.HasVectors);
+            }
+            reader.Dispose();
+        }
+
+        [Test]
+        public virtual void TestReader()
+        {
+            TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));
+            for (int j = 0; j < 5; j++)
+            {
+                Terms vector = reader.Get(j).GetTerms(TestFields[0]);
+                Assert.IsNotNull(vector);
+                Assert.AreEqual(TestTerms.Length, vector.Count);
+                TermsEnum termsEnum = vector.GetIterator(null);
+                for (int i = 0; i < TestTerms.Length; i++)
+                {
+                    BytesRef text = termsEnum.Next();
+                    Assert.IsNotNull(text);
+                    string term = text.Utf8ToString();
+                    //System.out.println("Term: " + term);
+                    Assert.AreEqual(TestTerms[i], term);
+                }
+                Assert.IsNull(termsEnum.Next());
+            }
+            reader.Dispose();
+        }
+
+        [Test]
+        public virtual void TestDocsEnum()
+        {
+            TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));
+            for (int j = 0; j < 5; j++)
+            {
+                Terms vector = reader.Get(j).GetTerms(TestFields[0]);
+                Assert.IsNotNull(vector);
+                Assert.AreEqual(TestTerms.Length, vector.Count);
+                TermsEnum termsEnum = vector.GetIterator(null);
+                DocsEnum docsEnum = null;
+                for (int i = 0; i < TestTerms.Length; i++)
+                {
+                    BytesRef text = termsEnum.Next();
+                    Assert.IsNotNull(text);
+                    string term = text.Utf8ToString();
+                    //System.out.println("Term: " + term);
+                    Assert.AreEqual(TestTerms[i], term);
+
+                    docsEnum = TestUtil.Docs(Random(), termsEnum, null, docsEnum, DocsEnum.FLAG_NONE);
+                    Assert.IsNotNull(docsEnum);
+                    int doc = docsEnum.DocID;
+                    Assert.AreEqual(-1, doc);
+                    Assert.IsTrue(docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
+                }
+                Assert.IsNull(termsEnum.Next());
+            }
+            reader.Dispose();
+        }
+
+        [Test]
+        public virtual void TestPositionReader()
+        {
+            TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));
+            //BytesRef[] terms; // LUCENENET NOTE: Not used in Lucene
+            Terms vector = reader.Get(0).GetTerms(TestFields[0]);
+            Assert.IsNotNull(vector);
+            Assert.AreEqual(TestTerms.Length, vector.Count);
+            TermsEnum termsEnum = vector.GetIterator(null);
+            DocsAndPositionsEnum dpEnum = null;
+            for (int i = 0; i < TestTerms.Length; i++)
+            {
+                BytesRef text = termsEnum.Next();
+                Assert.IsNotNull(text);
+                string term = text.Utf8ToString();
+                //System.out.println("Term: " + term);
+                Assert.AreEqual(TestTerms[i], term);
+
+                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+                Assert.IsNotNull(dpEnum);
+                int doc = dpEnum.DocID;
+                Assert.AreEqual(-1, doc);
+                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+                Assert.AreEqual(dpEnum.Freq, Positions[i].Length);
+                for (int j = 0; j < Positions[i].Length; j++)
+                {
+                    Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
+                }
+                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+                doc = dpEnum.DocID;
+                Assert.AreEqual(-1, doc);
+                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+                Assert.IsNotNull(dpEnum);
+                Assert.AreEqual(dpEnum.Freq, Positions[i].Length);
+                for (int j = 0; j < Positions[i].Length; j++)
+                {
+                    Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
+                    Assert.AreEqual(j * 10, dpEnum.StartOffset);
+                    Assert.AreEqual(j * 10 + TestTerms[i].Length, dpEnum.EndOffset);
+                }
+                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+            }
+
+            Terms freqVector = reader.Get(0).GetTerms(TestFields[1]); //no pos, no offset
+            Assert.IsNotNull(freqVector);
+            Assert.AreEqual(TestTerms.Length, freqVector.Count);
+            termsEnum = freqVector.GetIterator(null);
+            Assert.IsNotNull(termsEnum);
+            for (int i = 0; i < TestTerms.Length; i++)
+            {
+                BytesRef text = termsEnum.Next();
+                Assert.IsNotNull(text);
+                string term = text.Utf8ToString();
+                //System.out.println("Term: " + term);
+                Assert.AreEqual(TestTerms[i], term);
+                Assert.IsNotNull(termsEnum.Docs(null, null));
+                Assert.IsNull(termsEnum.DocsAndPositions(null, null)); // no pos
+            }
+            reader.Dispose();
+        }
+
+        [Test]
+        public virtual void TestOffsetReader()
+        {
+            TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));
+            Terms vector = reader.Get(0).GetTerms(TestFields[0]);
+            Assert.IsNotNull(vector);
+            TermsEnum termsEnum = vector.GetIterator(null);
+            Assert.IsNotNull(termsEnum);
+            Assert.AreEqual(TestTerms.Length, vector.Count);
+            DocsAndPositionsEnum dpEnum = null;
+            for (int i = 0; i < TestTerms.Length; i++)
+            {
+                BytesRef text = termsEnum.Next();
+                Assert.IsNotNull(text);
+                string term = text.Utf8ToString();
+                Assert.AreEqual(TestTerms[i], term);
+
+                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+                Assert.IsNotNull(dpEnum);
+                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+                Assert.AreEqual(dpEnum.Freq, Positions[i].Length);
+                for (int j = 0; j < Positions[i].Length; j++)
+                {
+                    Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
+                }
+                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+                Assert.IsNotNull(dpEnum);
+                Assert.AreEqual(dpEnum.Freq, Positions[i].Length);
+                for (int j = 0; j < Positions[i].Length; j++)
+                {
+                    Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
+                    Assert.AreEqual(j * 10, dpEnum.StartOffset);
+                    Assert.AreEqual(j * 10 + TestTerms[i].Length, dpEnum.EndOffset);
+                }
+                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+            }
+            reader.Dispose();
+        }
+
+        [Test]
+        public virtual void TestIllegalIndexableField()
+        {
+            Directory dir = NewDirectory();
+            RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
+            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+            ft.StoreTermVectors = true;
+            ft.StoreTermVectorPayloads = true;
+            Document doc = new Document();
+            doc.Add(new Field("field", "value", ft));
+            try
+            {
+                w.AddDocument(doc);
+                Assert.Fail("did not hit exception");
+            }
+            catch (System.ArgumentException iae)
+            {
+                // Expected
+                Assert.AreEqual("cannot index term vector payloads without term vector positions (field=\"field\")", iae.Message);
+            }
+
+            ft = new FieldType(TextField.TYPE_NOT_STORED);
+            ft.StoreTermVectors = false;
+            ft.StoreTermVectorOffsets = true;
+            doc = new Document();
+            doc.Add(new Field("field", "value", ft));
+            try
+            {
+                w.AddDocument(doc);
+                Assert.Fail("did not hit exception");
+            }
+            catch (System.ArgumentException iae)
+            {
+                // Expected
+                Assert.AreEqual("cannot index term vector offsets when term vectors are not indexed (field=\"field\")", iae.Message);
+            }
+
+            ft = new FieldType(TextField.TYPE_NOT_STORED);
+            ft.StoreTermVectors = false;
+            ft.StoreTermVectorPositions = true;
+            doc = new Document();
+            doc.Add(new Field("field", "value", ft));
+            try
+            {
+                w.AddDocument(doc);
+                Assert.Fail("did not hit exception");
+            }
+            catch (System.ArgumentException iae)
+            {
+                // Expected
+                Assert.AreEqual("cannot index term vector positions when term vectors are not indexed (field=\"field\")", iae.Message);
+            }
+
+            ft = new FieldType(TextField.TYPE_NOT_STORED);
+            ft.StoreTermVectors = false;
+            ft.StoreTermVectorPayloads = true;
+            doc = new Document();
+            doc.Add(new Field("field", "value", ft));
+            try
+            {
+                w.AddDocument(doc);
+                Assert.Fail("did not hit exception");
+            }
+            catch (System.ArgumentException iae)
+            {
+                // Expected
+                Assert.AreEqual("cannot index term vector payloads when term vectors are not indexed (field=\"field\")", iae.Message);
+            }
+
+            w.Dispose();
+
+            dir.Dispose();
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTermVectorsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/Index/TestTermVectorsWriter.cs b/src/Lucene.Net.Tests/Index/TestTermVectorsWriter.cs
new file mode 100644
index 0000000..355249e
--- /dev/null
+++ b/src/Lucene.Net.Tests/Index/TestTermVectorsWriter.cs
@@ -0,0 +1,601 @@
+using Lucene.Net.Documents;
+
+namespace Lucene.Net.Index
+{
+    using NUnit.Framework;
+    using System.IO;
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    using Analyzer = Lucene.Net.Analysis.Analyzer;
+    using BytesRef = Lucene.Net.Util.BytesRef;
+    using CachingTokenFilter = Lucene.Net.Analysis.CachingTokenFilter;
+    using Directory = Lucene.Net.Store.Directory;
+    using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
+    using Document = Documents.Document;
+    using Field = Field;
+    using FieldType = FieldType;
+    using IOUtils = Lucene.Net.Util.IOUtils;
+    using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
+    using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer;
+    using MockDirectoryWrapper = Lucene.Net.Store.MockDirectoryWrapper;
+    using MockTokenFilter = Lucene.Net.Analysis.MockTokenFilter;
+    using MockTokenizer = Lucene.Net.Analysis.MockTokenizer;
+    using RAMDirectory = Lucene.Net.Store.RAMDirectory;
+    using StringField = StringField;
+    using TextField = TextField;
+    using TokenStream = Lucene.Net.Analysis.TokenStream;
+
+    /// <summary>
+    /// tests for writing term vectors </summary>
+    [TestFixture]
+    public class TestTermVectorsWriter : LuceneTestCase
+    {
+        // LUCENE-1442
+        [Test]
+        public virtual void TestDoubleOffsetCounting()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document doc = new Document();
+            FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+            Field f = NewField("field", "abcd", customType);
+            doc.Add(f);
+            doc.Add(f);
+            Field f2 = NewField("field", "", customType);
+            doc.Add(f2);
+            doc.Add(f);
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            Terms vector = r.GetTermVectors(0).GetTerms("field");
+            Assert.IsNotNull(vector);
+            TermsEnum termsEnum = vector.GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            Assert.AreEqual("", termsEnum.Term.Utf8ToString());
+
+            // Token "" occurred once
+            Assert.AreEqual(1, termsEnum.TotalTermFreq);
+
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(8, dpEnum.StartOffset);
+            Assert.AreEqual(8, dpEnum.EndOffset);
+            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+            // Token "abcd" occurred three times
+            Assert.AreEqual(new BytesRef("abcd"), termsEnum.Next());
+            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+            Assert.AreEqual(3, termsEnum.TotalTermFreq);
+
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            dpEnum.NextPosition();
+            Assert.AreEqual(4, dpEnum.StartOffset);
+            Assert.AreEqual(8, dpEnum.EndOffset);
+
+            dpEnum.NextPosition();
+            Assert.AreEqual(8, dpEnum.StartOffset);
+            Assert.AreEqual(12, dpEnum.EndOffset);
+
+            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+            Assert.IsNull(termsEnum.Next());
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1442
+        [Test]
+        public virtual void TestDoubleOffsetCounting2()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document doc = new Document();
+            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+            Field f = NewField("field", "abcd", customType);
+            doc.Add(f);
+            doc.Add(f);
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+            Assert.AreEqual(2, termsEnum.TotalTermFreq);
+
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            dpEnum.NextPosition();
+            Assert.AreEqual(5, dpEnum.StartOffset);
+            Assert.AreEqual(9, dpEnum.EndOffset);
+            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1448
+        [Test]
+        public virtual void TestEndOffsetPositionCharAnalyzer()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document doc = new Document();
+            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+            Field f = NewField("field", "abcd   ", customType);
+            doc.Add(f);
+            doc.Add(f);
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+            Assert.AreEqual(2, termsEnum.TotalTermFreq);
+
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            dpEnum.NextPosition();
+            Assert.AreEqual(8, dpEnum.StartOffset);
+            Assert.AreEqual(12, dpEnum.EndOffset);
+            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1448
+        [Test]
+        public virtual void TestEndOffsetPositionWithCachingTokenFilter()
+        {
+            Directory dir = NewDirectory();
+            Analyzer analyzer = new MockAnalyzer(Random());
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+            Document doc = new Document();
+            IOException priorException = null;
+            TokenStream stream = analyzer.TokenStream("field", new StringReader("abcd   "));
+            try
+            {
+                stream.Reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
+                TokenStream cachedStream = new CachingTokenFilter(stream);
+                FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+                customType.StoreTermVectors = true;
+                customType.StoreTermVectorPositions = true;
+                customType.StoreTermVectorOffsets = true;
+                Field f = new Field("field", cachedStream, customType);
+                doc.Add(f);
+                doc.Add(f);
+                w.AddDocument(doc);
+            }
+            catch (IOException e)
+            {
+                priorException = e;
+            }
+            finally
+            {
+                IOUtils.CloseWhileHandlingException(priorException, stream);
+            }
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+            Assert.AreEqual(2, termsEnum.TotalTermFreq);
+
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            dpEnum.NextPosition();
+            Assert.AreEqual(8, dpEnum.StartOffset);
+            Assert.AreEqual(12, dpEnum.EndOffset);
+            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1448
+        [Test]
+        public virtual void TestEndOffsetPositionStopFilter()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
+            Document doc = new Document();
+            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+            Field f = NewField("field", "abcd the", customType);
+            doc.Add(f);
+            doc.Add(f);
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+            Assert.AreEqual(2, termsEnum.TotalTermFreq);
+
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            dpEnum.NextPosition();
+            Assert.AreEqual(9, dpEnum.StartOffset);
+            Assert.AreEqual(13, dpEnum.EndOffset);
+            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1448
+        [Test]
+        public virtual void TestEndOffsetPositionStandard()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document doc = new Document();
+            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+            Field f = NewField("field", "abcd the  ", customType);
+            Field f2 = NewField("field", "crunch man", customType);
+            doc.Add(f);
+            doc.Add(f2);
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            Assert.IsNotNull(termsEnum.Next());
+            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(11, dpEnum.StartOffset);
+            Assert.AreEqual(17, dpEnum.EndOffset);
+
+            Assert.IsNotNull(termsEnum.Next());
+            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(18, dpEnum.StartOffset);
+            Assert.AreEqual(21, dpEnum.EndOffset);
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1448
+        [Test]
+        public virtual void TestEndOffsetPositionStandardEmptyField()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document doc = new Document();
+            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+            Field f = NewField("field", "", customType);
+            Field f2 = NewField("field", "crunch man", customType);
+            doc.Add(f);
+            doc.Add(f2);
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+
+            Assert.AreEqual(1, (int)termsEnum.TotalTermFreq);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(1, dpEnum.StartOffset);
+            Assert.AreEqual(7, dpEnum.EndOffset);
+
+            Assert.IsNotNull(termsEnum.Next());
+            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(8, dpEnum.StartOffset);
+            Assert.AreEqual(11, dpEnum.EndOffset);
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1448
+        [Test]
+        public virtual void TestEndOffsetPositionStandardEmptyField2()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document doc = new Document();
+            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            customType.StoreTermVectorPositions = true;
+            customType.StoreTermVectorOffsets = true;
+
+            Field f = NewField("field", "abcd", customType);
+            doc.Add(f);
+            doc.Add(NewField("field", "", customType));
+
+            Field f2 = NewField("field", "crunch", customType);
+            doc.Add(f2);
+
+            w.AddDocument(doc);
+            w.Dispose();
+
+            IndexReader r = DirectoryReader.Open(dir);
+            TermsEnum termsEnum = r.GetTermVectors(0).GetTerms("field").GetIterator(null);
+            Assert.IsNotNull(termsEnum.Next());
+            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
+
+            Assert.AreEqual(1, (int)termsEnum.TotalTermFreq);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(0, dpEnum.StartOffset);
+            Assert.AreEqual(4, dpEnum.EndOffset);
+
+            Assert.IsNotNull(termsEnum.Next());
+            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
+            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+            dpEnum.NextPosition();
+            Assert.AreEqual(6, dpEnum.StartOffset);
+            Assert.AreEqual(12, dpEnum.EndOffset);
+
+            r.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1168
+        [Test]
+        public virtual void TestTermVectorCorruption()
+        {
+            Directory dir = NewDirectory();
+            for (int iter = 0; iter < 2; iter++)
+            {
+                IndexWriter writer = new IndexWriter(dir, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)).SetMergeScheduler(new SerialMergeScheduler()).SetMergePolicy(new LogDocMergePolicy()));
+
+                Document document = new Document();
+                FieldType customType = new FieldType();
+                customType.IsStored = true;
+
+                Field storedField = NewField("stored", "stored", customType);
+                document.Add(storedField);
+                writer.AddDocument(document);
+                writer.AddDocument(document);
+
+                document = new Document();
+                document.Add(storedField);
+                FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
+                customType2.StoreTermVectors = true;
+                customType2.StoreTermVectorPositions = true;
+                customType2.StoreTermVectorOffsets = true;
+                Field termVectorField = NewField("termVector", "termVector", customType2);
+
+                document.Add(termVectorField);
+                writer.AddDocument(document);
+                writer.ForceMerge(1);
+                writer.Dispose();
+
+                IndexReader reader = DirectoryReader.Open(dir);
+                for (int i = 0; i < reader.NumDocs; i++)
+                {
+                    reader.Document(i);
+                    reader.GetTermVectors(i);
+                }
+                reader.Dispose();
+
+                writer = new IndexWriter(dir, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)).SetMergeScheduler(new SerialMergeScheduler()).SetMergePolicy(new LogDocMergePolicy()));
+
+                Directory[] indexDirs = new Directory[] { new MockDirectoryWrapper(Random(), new RAMDirectory(dir, NewIOContext(Random()))) };
+                writer.AddIndexes(indexDirs);
+                writer.ForceMerge(1);
+                writer.Dispose();
+            }
+            dir.Dispose();
+        }
+
+        // LUCENE-1168
+        [Test]
+        public virtual void TestTermVectorCorruption2()
+        {
+            Directory dir = NewDirectory();
+            for (int iter = 0; iter < 2; iter++)
+            {
+                IndexWriter writer = new IndexWriter(dir, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)).SetMergeScheduler(new SerialMergeScheduler()).SetMergePolicy(new LogDocMergePolicy()));
+
+                Document document = new Document();
+
+                FieldType customType = new FieldType();
+                customType.IsStored = true;
+
+                Field storedField = NewField("stored", "stored", customType);
+                document.Add(storedField);
+                writer.AddDocument(document);
+                writer.AddDocument(document);
+
+                document = new Document();
+                document.Add(storedField);
+                FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
+                customType2.StoreTermVectors = true;
+                customType2.StoreTermVectorPositions = true;
+                customType2.StoreTermVectorOffsets = true;
+                Field termVectorField = NewField("termVector", "termVector", customType2);
+                document.Add(termVectorField);
+                writer.AddDocument(document);
+                writer.ForceMerge(1);
+                writer.Dispose();
+
+                IndexReader reader = DirectoryReader.Open(dir);
+                Assert.IsNull(reader.GetTermVectors(0));
+                Assert.IsNull(reader.GetTermVectors(1));
+                Assert.IsNotNull(reader.GetTermVectors(2));
+                reader.Dispose();
+            }
+            dir.Dispose();
+        }
+
+        // LUCENE-1168
+        [Test]
+        public virtual void TestTermVectorCorruption3()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter writer = new IndexWriter(dir, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)).SetMergeScheduler(new SerialMergeScheduler()).SetMergePolicy(new LogDocMergePolicy()));
+
+            Document document = new Document();
+            FieldType customType = new FieldType();
+            customType.IsStored = true;
+
+            Field storedField = NewField("stored", "stored", customType);
+            document.Add(storedField);
+            FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
+            customType2.StoreTermVectors = true;
+            customType2.StoreTermVectorPositions = true;
+            customType2.StoreTermVectorOffsets = true;
+            Field termVectorField = NewField("termVector", "termVector", customType2);
+            document.Add(termVectorField);
+            for (int i = 0; i < 10; i++)
+            {
+                writer.AddDocument(document);
+            }
+            writer.Dispose();
+
+            writer = new IndexWriter(dir, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)).SetMergeScheduler(new SerialMergeScheduler()).SetMergePolicy(new LogDocMergePolicy()));
+            for (int i = 0; i < 6; i++)
+            {
+                writer.AddDocument(document);
+            }
+
+            writer.ForceMerge(1);
+            writer.Dispose();
+
+            IndexReader reader = DirectoryReader.Open(dir);
+            for (int i = 0; i < 10; i++)
+            {
+                reader.GetTermVectors(i);
+                reader.Document(i);
+            }
+            reader.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1008
+        [Test]
+        public virtual void TestNoTermVectorAfterTermVector()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter iw = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document document = new Document();
+            FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
+            customType2.StoreTermVectors = true;
+            customType2.StoreTermVectorPositions = true;
+            customType2.StoreTermVectorOffsets = true;
+            document.Add(NewField("tvtest", "a b c", customType2));
+            iw.AddDocument(document);
+            document = new Document();
+            document.Add(NewTextField("tvtest", "x y z", Field.Store.NO));
+            iw.AddDocument(document);
+            // Make first segment
+            iw.Commit();
+
+            FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            document.Add(NewField("tvtest", "a b c", customType));
+            iw.AddDocument(document);
+            // Make 2nd segment
+            iw.Commit();
+
+            iw.ForceMerge(1);
+            iw.Dispose();
+            dir.Dispose();
+        }
+
+        // LUCENE-1010
+        [Test]
+        public virtual void TestNoTermVectorAfterTermVectorMerge()
+        {
+            Directory dir = NewDirectory();
+            IndexWriter iw = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
+            Document document = new Document();
+            FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
+            customType.StoreTermVectors = true;
+            document.Add(NewField("tvtest", "a b c", customType));
+            iw.AddDocument(document);
+            iw.Commit();
+
+            document = new Document();
+            document.Add(NewTextField("tvtest", "x y z", Field.Store.NO));
+            iw.AddDocument(document);
+            // Make first segment
+            iw.Commit();
+
+            iw.ForceMerge(1);
+
+            FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
+            customType2.StoreTermVectors = true;
+            document.Add(NewField("tvtest", "a b c", customType2));
+            iw.AddDocument(document);
+            // Make 2nd segment
+            iw.Commit();
+            iw.ForceMerge(1);
+
+            iw.Dispose();
+            dir.Dispose();
+        }
+    }
+}
\ No newline at end of file
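[Editor's note: the LUCENE-1442/LUCENE-1448 tests above all assert the same offset rule for multi-valued analyzed fields: each later value's offsets are shifted by the total character length of the earlier values plus one offset gap per value boundary (the assertions imply a gap of 1 for MockAnalyzer). A hypothetical helper, not part of the commit, reproducing the expected start offsets:]

    // Expected character offset at which value k of a multi-valued analyzed
    // field begins: the earlier values' lengths plus one offset gap per
    // boundary crossed.
    public static int ValueStartOffset(string[] values, int k, int offsetGap)
    {
        int start = 0;
        for (int i = 0; i < k; i++)
        {
            start += values[i].Length + offsetGap;
        }
        return start;
    }

For example, with the two "abcd   " values (seven characters each) in TestEndOffsetPositionCharAnalyzer, ValueStartOffset(values, 1, 1) returns 8, matching the asserted StartOffset of the second "abcd"; with "abcd the  " followed by "crunch man" in TestEndOffsetPositionStandard it returns 11 for the second value, matching the "crunch" assertions.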
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTermdocPerf.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/Index/TestTermdocPerf.cs b/src/Lucene.Net.Tests/Index/TestTermdocPerf.cs
new file mode 100644
index 0000000..c76b4ee
--- /dev/null
+++ b/src/Lucene.Net.Tests/Index/TestTermdocPerf.cs
@@ -0,0 +1,176 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Attributes;
+using Lucene.Net.Documents;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Index
+{
+    using System.IO;
+
+    /*
+    /// Copyright 2006 The Apache Software Foundation
+    ///
+    /// Licensed under the Apache License, Version 2.0 (the "License");
+    /// you may not use this file except in compliance with the License.
+    /// You may obtain a copy of the License at
+    ///
+    ///     http://www.apache.org/licenses/LICENSE-2.0
+    ///
+    /// Unless required by applicable law or agreed to in writing, software
+    /// distributed under the License is distributed on an "AS IS" BASIS,
+    /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    /// See the License for the specific language governing permissions and
+    /// limitations under the License.
+    */
+
+    using Analyzer = Lucene.Net.Analysis.Analyzer;
+    using BytesRef = Lucene.Net.Util.BytesRef;
+    using CharTermAttribute = Lucene.Net.Analysis.TokenAttributes.CharTermAttribute;
+    using Directory = Lucene.Net.Store.Directory;
+    using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
+    using Document = Documents.Document;
+    using Field = Field;
+    using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
+    using TestUtil = Lucene.Net.Util.TestUtil;
+    using Tokenizer = Lucene.Net.Analysis.Tokenizer;
+
+    internal class RepeatingTokenizer : Tokenizer
+    {
+        private readonly Random Random;
+        private readonly float PercentDocs;
+        private readonly int MaxTF;
+        private int Num;
+        internal ICharTermAttribute TermAtt;
+        internal string Value;
+
+        public RepeatingTokenizer(TextReader reader, string val, Random random, float percentDocs, int maxTF)
+            : base(reader)
+        {
+            this.Value = val;
+            this.Random = random;
+            this.PercentDocs = percentDocs;
+            this.MaxTF = maxTF;
+            this.TermAtt = AddAttribute<ICharTermAttribute>();
+        }
+
+        public sealed override bool IncrementToken()
+        {
+            Num--;
+            if (Num >= 0)
+            {
+                ClearAttributes();
+                TermAtt.Append(Value);
+                return true;
+            }
+            return false;
+        }
+
+        public override void Reset()
+        {
+            base.Reset();
+            if (Random.NextDouble() < PercentDocs)
+            {
+                Num = Random.Next(MaxTF) + 1;
+            }
+            else
+            {
+                Num = 0;
+            }
+        }
+    }
+
+    [TestFixture]
+    public class TestTermdocPerf : LuceneTestCase
+    {
+        internal virtual void AddDocs(Random random, Directory dir, int ndocs, string field, string val, int maxTF, float percentDocs)
+        {
+            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(random, val, maxTF, percentDocs);
+
+            Document doc = new Document();
+
+            doc.Add(NewStringField(field, val, Field.Store.NO));
+            IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetOpenMode(OpenMode.CREATE).SetMaxBufferedDocs(100).SetMergePolicy(NewLogMergePolicy(100)));
+
+            for (int i = 0; i < ndocs; i++)
+            {
+                writer.AddDocument(doc);
+            }
+
+            writer.ForceMerge(1);
+            writer.Dispose();
+        }
+
+        private class AnalyzerAnonymousInnerClassHelper : Analyzer
+        {
+            private Random Random;
+            private string Val;
+            private int MaxTF;
+            private float PercentDocs;
+
+            public AnalyzerAnonymousInnerClassHelper(Random random, string val, int maxTF, float percentDocs)
+            {
+                this.Random = random;
+                this.Val = val;
+                this.MaxTF = maxTF;
+                this.PercentDocs = percentDocs;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                return new TokenStreamComponents(new RepeatingTokenizer(reader, Val, Random, PercentDocs, MaxTF));
+            }
+        }
+
+        public virtual int DoTest(int iter, int ndocs, int maxTF, float percentDocs)
+        {
+            Directory dir = NewDirectory();
+
+            long start = Environment.TickCount;
+            AddDocs(Random(), dir, ndocs, "foo", "val", maxTF, percentDocs);
+            long end = Environment.TickCount;
+            if (VERBOSE)
+            {
+                Console.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));
+            }
+
+            IndexReader reader = DirectoryReader.Open(dir);
+
+            TermsEnum tenum = MultiFields.GetTerms(reader, "foo").GetIterator(null);
+
+            start = Environment.TickCount;
+
+            int ret = 0;
+            DocsEnum tdocs = null;
+            Random random = new Random(Random().Next());
+            for (int i = 0; i < iter; i++)
+            {
+                tenum.SeekCeil(new BytesRef("val"));
+                tdocs = TestUtil.Docs(random, tenum, MultiFields.GetLiveDocs(reader), tdocs, DocsEnum.FLAG_NONE);
+                while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
+                {
+                    ret += tdocs.DocID;
+                }
+            }
+
+            end = Environment.TickCount;
+            if (VERBOSE)
+            {
+                Console.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));
+            }
+
+            return ret;
+        }
+
+#if !NETSTANDARD
+        // LUCENENET: There is no Timeout on NUnit for .NET Core.
+        [Timeout(120000)]
+#endif
+        [Test, LongRunningTest, HasTimeout]
+        public virtual void TestTermDocPerf()
+        {
+            // performance test for 10% of documents containing a term
+            DoTest(100000, 10000, 3, .1f);
+        }
+    }
+}
\ No newline at end of file
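[Editor's note: DoTest above times its runs with Environment.TickCount, a 32-bit millisecond counter with roughly 10-16 ms resolution that wraps after about 24.9 days. That is adequate for the 100,000-iteration run here, but System.Diagnostics.Stopwatch gives finer-grained, wrap-free timing. A sketch of the alternative, not part of the commit:]

    using System.Diagnostics;

    var sw = Stopwatch.StartNew();
    // ... the indexing or TermDocs iteration being measured ...
    sw.Stop();
    Console.WriteLine("elapsed ms = " + sw.ElapsedMilliseconds);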
