http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTermsEnum.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Index/TestTermsEnum.cs b/src/Lucene.Net.Tests/Index/TestTermsEnum.cs new file mode 100644 index 0000000..ed848b3 --- /dev/null +++ b/src/Lucene.Net.Tests/Index/TestTermsEnum.cs @@ -0,0 +1,1050 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using Lucene.Net.Attributes; +using Lucene.Net.Documents; +using Lucene.Net.Search; + +namespace Lucene.Net.Index +{ + using Lucene.Net.Randomized.Generators; + using NUnit.Framework; + using Automaton = Lucene.Net.Util.Automaton.Automaton; + using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata; + using BytesRef = Lucene.Net.Util.BytesRef; + using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton; + using Directory = Lucene.Net.Store.Directory; + using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; + using Document = Documents.Document; + using Field = Field; + using Int32Field = Int32Field; + using LineFileDocs = Lucene.Net.Util.LineFileDocs; + using LuceneTestCase = Lucene.Net.Util.LuceneTestCase; + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; + using RegExp = Lucene.Net.Util.Automaton.RegExp; + using TestUtil = Lucene.Net.Util.TestUtil; + + [SuppressCodecs("SimpleText", "Memory", "Direct")] + [TestFixture] + public class TestTermsEnum : LuceneTestCase + { + [Test] + public virtual void Test() + { + Random random = new Random(Random().Next()); + LineFileDocs docs = new LineFileDocs(random, DefaultCodecSupportsDocValues()); + Directory d = NewDirectory(); + MockAnalyzer analyzer = new MockAnalyzer(Random()); + analyzer.MaxTokenLength = TestUtil.NextInt(Random(), 1, IndexWriter.MAX_TERM_LENGTH); + RandomIndexWriter w = new RandomIndexWriter(Random(), d, analyzer, Similarity, TimeZone); + int numDocs = AtLeast(10); + for (int docCount = 0; docCount < numDocs; docCount++) + { + w.AddDocument(docs.NextDoc()); + } + IndexReader r = w.Reader; + w.Dispose(); + + List<BytesRef> terms = new List<BytesRef>(); + TermsEnum termsEnum = MultiFields.GetTerms(r, "body").GetIterator(null); + BytesRef term; + while ((term = termsEnum.Next()) != null) + { + terms.Add(BytesRef.DeepCopyOf(term)); + } + if (VERBOSE) + { + Console.WriteLine("TEST: " + terms.Count + " terms"); + } + + int upto = -1; + int iters = AtLeast(200); + for (int iter = 0; iter < iters; iter++) + { + bool isEnd; + if (upto != -1 && Random().NextBoolean()) + { + // next + if (VERBOSE) + { + Console.WriteLine("TEST: iter next"); + } + isEnd = termsEnum.Next() == null; + upto++; + if (isEnd) + { + if (VERBOSE) + { + Console.WriteLine(" end"); + } + Assert.AreEqual(upto, terms.Count); + upto = -1; + } + else + { + if (VERBOSE) + { + Console.WriteLine(" got term=" + termsEnum.Term.Utf8ToString() + " expected=" + terms[upto].Utf8ToString()); + } + Assert.IsTrue(upto < terms.Count); + Assert.AreEqual(terms[upto], termsEnum.Term); + } + } + else + { + BytesRef target; + string exists; + if (Random().NextBoolean()) + { + // likely fake term + if (Random().NextBoolean()) + { + target = new BytesRef(TestUtil.RandomSimpleString(Random())); + } + else + { + target = new BytesRef(TestUtil.RandomRealisticUnicodeString(Random())); + } + exists = "likely not"; + } + else + { + // real term + target = terms[Random().Next(terms.Count)]; + exists = "yes"; + } + + upto = terms.BinarySearch(target); + + if (Random().NextBoolean()) + { + if (VERBOSE) + { + Console.WriteLine("TEST: iter seekCeil target=" + target.Utf8ToString() + " exists=" + exists); + } + // seekCeil + TermsEnum.SeekStatus status = termsEnum.SeekCeil(target); + if (VERBOSE) + { + Console.WriteLine(" got " + status); + } + + if (upto < 0) + { + upto = -(upto + 1); + if (upto >= terms.Count) + { + Assert.AreEqual(TermsEnum.SeekStatus.END, status); + upto = -1; + } + else + { + Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status); + Assert.AreEqual(terms[upto], termsEnum.Term); + } + } + else + { + Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status); + Assert.AreEqual(terms[upto], termsEnum.Term); + } + } + else + { + if (VERBOSE) + { + Console.WriteLine("TEST: iter seekExact target=" + target.Utf8ToString() + " exists=" + exists); + } + // seekExact + bool result = termsEnum.SeekExact(target); + if (VERBOSE) + { + Console.WriteLine(" got " + result); + } + if (upto < 0) + { + Assert.IsFalse(result); + upto = -1; + } + else + { + Assert.IsTrue(result); + Assert.AreEqual(target, termsEnum.Term); + } + } + } + } + + r.Dispose(); + d.Dispose(); + docs.Dispose(); + } + + private void AddDoc(RandomIndexWriter w, ICollection<string> terms, IDictionary<BytesRef, int?> termToID, 
int id) + { + Document doc = new Document(); + doc.Add(new Int32Field("id", id, Field.Store.NO)); + if (VERBOSE) + { + Console.WriteLine("TEST: addDoc id:" + id + " terms=" + terms); + } + foreach (string s2 in terms) + { + doc.Add(NewStringField("f", s2, Field.Store.NO)); + termToID[new BytesRef(s2)] = id; + } + w.AddDocument(doc); + terms.Clear(); + } + + private bool Accepts(CompiledAutomaton c, BytesRef b) + { + int state = c.RunAutomaton.InitialState; + for (int idx = 0; idx < b.Length; idx++) + { + Assert.IsTrue(state != -1); + state = c.RunAutomaton.Step(state, b.Bytes[b.Offset + idx] & 0xff); + } + return c.RunAutomaton.IsAccept(state); + } + + // Tests Terms.intersect +#if !NETSTANDARD + // LUCENENET: There is no Timeout on NUnit for .NET Core. + [Timeout(int.MaxValue)] +#endif + [Test, LongRunningTest, HasTimeout] + public virtual void TestIntersectRandom() + { + Directory dir = NewDirectory(); + RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); + + int numTerms = AtLeast(300); + //final int numTerms = 50; + + HashSet<string> terms = new HashSet<string>(); + ICollection<string> pendingTerms = new List<string>(); + IDictionary<BytesRef, int?> termToID = new Dictionary<BytesRef, int?>(); + int id = 0; + while (terms.Count != numTerms) + { + string s = RandomString; + if (!terms.Contains(s)) + { + terms.Add(s); + pendingTerms.Add(s); + if (Random().Next(20) == 7) + { + AddDoc(w, pendingTerms, termToID, id++); + } + } + } + AddDoc(w, pendingTerms, termToID, id++); + + BytesRef[] termsArray = new BytesRef[terms.Count]; + HashSet<BytesRef> termsSet = new HashSet<BytesRef>(); + { + int upto = 0; + foreach (string s in terms) + { + BytesRef b = new BytesRef(s); + termsArray[upto++] = b; + termsSet.Add(b); + } + Array.Sort(termsArray); + } + + if (VERBOSE) + { + Console.WriteLine("\nTEST: indexed terms (unicode order):"); + foreach (BytesRef t in termsArray) + { + Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]); + } + } + + IndexReader r = w.Reader; + w.Dispose(); + + // NOTE: intentional insanity!! + FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false); + + for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) + { + // TODO: can we also test infinite As here...? 
+ + // From the random terms, pick some ratio and compile an + // automaton: + HashSet<string> acceptTerms = new HashSet<string>(); + SortedSet<BytesRef> sortedAcceptTerms = new SortedSet<BytesRef>(); + double keepPct = Random().NextDouble(); + Automaton a; + if (iter == 0) + { + if (VERBOSE) + { + Console.WriteLine("\nTEST: empty automaton"); + } + a = BasicAutomata.MakeEmpty(); + } + else + { + if (VERBOSE) + { + Console.WriteLine("\nTEST: keepPct=" + keepPct); + } + foreach (string s in terms) + { + string s2; + if (Random().NextDouble() <= keepPct) + { + s2 = s; + } + else + { + s2 = RandomString; + } + acceptTerms.Add(s2); + sortedAcceptTerms.Add(new BytesRef(s2)); + } + a = BasicAutomata.MakeStringUnion(sortedAcceptTerms); + } + + if (Random().NextBoolean()) + { + if (VERBOSE) + { + Console.WriteLine("TEST: reduce the automaton"); + } + a.Reduce(); + } + + CompiledAutomaton c = new CompiledAutomaton(a, true, false); + + BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count]; + HashSet<BytesRef> acceptTermsSet = new HashSet<BytesRef>(); + int upto = 0; + foreach (string s in acceptTerms) + { + BytesRef b = new BytesRef(s); + acceptTermsArray[upto++] = b; + acceptTermsSet.Add(b); + Assert.IsTrue(Accepts(c, b)); + } + Array.Sort(acceptTermsArray); + + if (VERBOSE) + { + Console.WriteLine("\nTEST: accept terms (unicode order):"); + foreach (BytesRef t in acceptTermsArray) + { + Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : "")); + } + Console.WriteLine(a.ToDot()); + } + + for (int iter2 = 0; iter2 < 100; iter2++) + { + BytesRef startTerm = acceptTermsArray.Length == 0 || Random().NextBoolean() ? null : acceptTermsArray[Random().Next(acceptTermsArray.Length)]; + + if (VERBOSE) + { + Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.Utf8ToString())); + + if (startTerm != null) + { + int state = c.RunAutomaton.InitialState; + for (int idx = 0; idx < startTerm.Length; idx++) + { + int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff; + Console.WriteLine(" state=" + state + " label=" + label); + state = c.RunAutomaton.Step(state, label); + Assert.IsTrue(state != -1); + } + Console.WriteLine(" state=" + state); + } + } + + TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm); + + int loc; + if (startTerm == null) + { + loc = 0; + } + else + { + loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm)); + if (loc < 0) + { + loc = -(loc + 1); + } + else + { + // startTerm exists in index + loc++; + } + } + while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])) + { + loc++; + } + + DocsEnum docsEnum = null; + while (loc < termsArray.Length) + { + BytesRef expected = termsArray[loc]; + BytesRef actual = te.Next(); + if (VERBOSE) + { + Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? 
"null" : actual.Utf8ToString())); + } + Assert.AreEqual(expected, actual); + Assert.AreEqual(1, te.DocFreq); + docsEnum = TestUtil.Docs(Random(), te, null, docsEnum, DocsEnum.FLAG_NONE); + int docID = docsEnum.NextDoc(); + Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS); + Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]); + do + { + loc++; + } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])); + } + Assert.IsNull(te.Next()); + } + } + + r.Dispose(); + dir.Dispose(); + } + + private readonly string FIELD = "field"; + + private IndexReader MakeIndex(Directory d, params string[] terms) + { + var iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + + /* + iwc.SetCodec(new StandardCodec(minTermsInBlock, maxTermsInBlock)); + */ + + using (var w = new RandomIndexWriter(Random(), d, iwc)) + { + foreach (string term in terms) + { + var doc = new Document(); + var f = NewStringField(FIELD, term, Field.Store.NO); + doc.Add(f); + w.AddDocument(doc); + } + + return w.Reader; + } + } + + private int DocFreq(IndexReader r, string term) + { + return r.DocFreq(new Term(FIELD, term)); + } + + [Test] + public virtual void TestEasy() + { + // No floor arcs: + using (var d = NewDirectory()) + using (var r = MakeIndex(d, "aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa")) + { + // First term in block: + Assert.AreEqual(1, DocFreq(r, "aa0")); + + // Scan forward to another term in same block + Assert.AreEqual(1, DocFreq(r, "aa2")); + + Assert.AreEqual(1, DocFreq(r, "aa")); + + // Reset same block then scan forwards + Assert.AreEqual(1, DocFreq(r, "aa1")); + + // Not found, in same block + Assert.AreEqual(0, DocFreq(r, "aa5")); + + // Found, in same block + Assert.AreEqual(1, DocFreq(r, "aa2")); + + // Not found in index: + Assert.AreEqual(0, DocFreq(r, "b0")); + + // Found: + Assert.AreEqual(1, DocFreq(r, "aa2")); + + // Found, rewind: + Assert.AreEqual(1, DocFreq(r, "aa0")); + + // First term in block: + Assert.AreEqual(1, DocFreq(r, "bb0")); + + // Scan forward to another term in same block + Assert.AreEqual(1, DocFreq(r, "bb2")); + + // Reset same block then scan forwards + Assert.AreEqual(1, DocFreq(r, "bb1")); + + // Not found, in same block + Assert.AreEqual(0, DocFreq(r, "bb5")); + + // Found, in same block + Assert.AreEqual(1, DocFreq(r, "bb2")); + + // Not found in index: + Assert.AreEqual(0, DocFreq(r, "b0")); + + // Found: + Assert.AreEqual(1, DocFreq(r, "bb2")); + + // Found, rewind: + Assert.AreEqual(1, DocFreq(r, "bb0")); + } + } + + // tests: + // - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix) + // - term that's entirely in the index + + [Test] + public virtual void TestFloorBlocks() + { + var terms = new[] { "aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx" }; + + using (var d = NewDirectory()) + using (var r = MakeIndex(d, terms)) + { + // First term in first block: + Assert.AreEqual(1, DocFreq(r, "aa0")); + Assert.AreEqual(1, DocFreq(r, "aa4")); + + // No block + Assert.AreEqual(0, DocFreq(r, "bb0")); + + // Second block + Assert.AreEqual(1, DocFreq(r, "aa4")); + + // Backwards to prior floor block: + Assert.AreEqual(1, DocFreq(r, "aa0")); + + // Forwards to last floor block: + Assert.AreEqual(1, DocFreq(r, "aa9")); + + Assert.AreEqual(0, DocFreq(r, "a")); + Assert.AreEqual(1, DocFreq(r, "aa")); + Assert.AreEqual(0, DocFreq(r, "a")); + Assert.AreEqual(1, DocFreq(r, "aa")); + + // Forwards to last floor block: + 
Assert.AreEqual(1, DocFreq(r, "xx")); + Assert.AreEqual(1, DocFreq(r, "aa1")); + Assert.AreEqual(0, DocFreq(r, "yy")); + + Assert.AreEqual(1, DocFreq(r, "xx")); + Assert.AreEqual(1, DocFreq(r, "aa9")); + + Assert.AreEqual(1, DocFreq(r, "xx")); + Assert.AreEqual(1, DocFreq(r, "aa4")); + + TermsEnum te = MultiFields.GetTerms(r, FIELD).GetIterator(null); + while (te.Next() != null) + { + //System.out.println("TEST: next term=" + te.Term().Utf8ToString()); + } + + Assert.IsTrue(SeekExact(te, "aa1")); + Assert.AreEqual("aa2", Next(te)); + Assert.IsTrue(SeekExact(te, "aa8")); + Assert.AreEqual("aa9", Next(te)); + Assert.AreEqual("xx", Next(te)); + + TestRandomSeeks(r, terms); + } + } + + [Test] + public virtual void TestZeroTerms() + { + var d = NewDirectory(); + RandomIndexWriter w = new RandomIndexWriter(Random(), d, Similarity, TimeZone); + Document doc = new Document(); + doc.Add(NewTextField("field", "one two three", Field.Store.NO)); + doc = new Document(); + doc.Add(NewTextField("field2", "one two three", Field.Store.NO)); + w.AddDocument(doc); + w.Commit(); + w.DeleteDocuments(new Term("field", "one")); + w.ForceMerge(1); + IndexReader r = w.Reader; + w.Dispose(); + Assert.AreEqual(1, r.NumDocs); + Assert.AreEqual(1, r.MaxDoc); + Terms terms = MultiFields.GetTerms(r, "field"); + if (terms != null) + { + Assert.IsNull(terms.GetIterator(null).Next()); + } + r.Dispose(); + d.Dispose(); + } + + private string RandomString + { + get + { + //return TestUtil.RandomSimpleString(Random()); + return TestUtil.RandomRealisticUnicodeString(Random()); + } + } + + [Test] + public virtual void TestRandomTerms() + { + var terms = new string[TestUtil.NextInt(Random(), 1, AtLeast(1000))]; + var seen = new HashSet<string>(); + + var allowEmptyString = Random().NextBoolean(); + + if (Random().Next(10) == 7 && terms.Length > 2) + { + // Sometimes add a bunch of terms sharing a longish common prefix: + int numTermsSamePrefix = Random().Next(terms.Length / 2); + if (numTermsSamePrefix > 0) + { + string prefix; + while (true) + { + prefix = RandomString; + if (prefix.Length < 5) + { + continue; + } + else + { + break; + } + } + while (seen.Count < numTermsSamePrefix) + { + string t = prefix + RandomString; + if (!seen.Contains(t)) + { + terms[seen.Count] = t; + seen.Add(t); + } + } + } + } + + while (seen.Count < terms.Length) + { + string t = RandomString; + if (!seen.Contains(t) && (allowEmptyString || t.Length != 0)) + { + terms[seen.Count] = t; + seen.Add(t); + } + } + + using (var d = NewDirectory()) + using (var r = MakeIndex(d, terms)) + { + TestRandomSeeks(r, terms); + } + } + + // sugar + private bool SeekExact(TermsEnum te, string term) + { + return te.SeekExact(new BytesRef(term)); + } + + // sugar + private string Next(TermsEnum te) + { + BytesRef br = te.Next(); + if (br == null) + { + return null; + } + else + { + return br.Utf8ToString(); + } + } + + private BytesRef GetNonExistTerm(BytesRef[] terms) + { + BytesRef t = null; + while (true) + { + string ts = RandomString; + t = new BytesRef(ts); + if (Array.BinarySearch(terms, t) < 0) + { + return t; + } + } + } + + private class TermAndState + { + public readonly BytesRef Term; + public readonly TermState State; + + public TermAndState(BytesRef term, TermState state) + { + this.Term = term; + this.State = state; + } + } + + private void TestRandomSeeks(IndexReader r, params string[] validTermStrings) + { + BytesRef[] validTerms = new BytesRef[validTermStrings.Length]; + for (int termIDX = 0; termIDX < validTermStrings.Length; termIDX++) + { + 
validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]); + } + Array.Sort(validTerms); + if (VERBOSE) + { + Console.WriteLine("TEST: " + validTerms.Length + " terms:"); + foreach (BytesRef t in validTerms) + { + Console.WriteLine(" " + t.Utf8ToString() + " " + t); + } + } + TermsEnum te = MultiFields.GetTerms(r, FIELD).GetIterator(null); + + int END_LOC = -validTerms.Length - 1; + + IList<TermAndState> termStates = new List<TermAndState>(); + + for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++) + { + BytesRef t; + int loc; + TermState termState; + if (Random().Next(6) == 4) + { + // pick term that doesn't exist: + t = GetNonExistTerm(validTerms); + termState = null; + if (VERBOSE) + { + Console.WriteLine("\nTEST: invalid term=" + t.Utf8ToString()); + } + loc = Array.BinarySearch(validTerms, t); + } + else if (termStates.Count != 0 && Random().Next(4) == 1) + { + TermAndState ts = termStates[Random().Next(termStates.Count)]; + t = ts.Term; + loc = Array.BinarySearch(validTerms, t); + Assert.IsTrue(loc >= 0); + termState = ts.State; + if (VERBOSE) + { + Console.WriteLine("\nTEST: valid termState term=" + t.Utf8ToString()); + } + } + else + { + // pick valid term + loc = Random().Next(validTerms.Length); + t = BytesRef.DeepCopyOf(validTerms[loc]); + termState = null; + if (VERBOSE) + { + Console.WriteLine("\nTEST: valid term=" + t.Utf8ToString()); + } + } + + // seekCeil or seekExact: + bool doSeekExact = Random().NextBoolean(); + if (termState != null) + { + if (VERBOSE) + { + Console.WriteLine(" seekExact termState"); + } + te.SeekExact(t, termState); + } + else if (doSeekExact) + { + if (VERBOSE) + { + Console.WriteLine(" seekExact"); + } + Assert.AreEqual(loc >= 0, te.SeekExact(t)); + } + else + { + if (VERBOSE) + { + Console.WriteLine(" seekCeil"); + } + + TermsEnum.SeekStatus result = te.SeekCeil(t); + if (VERBOSE) + { + Console.WriteLine(" got " + result); + } + + if (loc >= 0) + { + Assert.AreEqual(TermsEnum.SeekStatus.FOUND, result); + } + else if (loc == END_LOC) + { + Assert.AreEqual(TermsEnum.SeekStatus.END, result); + } + else + { + Debug.Assert(loc >= -validTerms.Length); + Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, result); + } + } + + if (loc >= 0) + { + Assert.AreEqual(t, te.Term); + } + else if (doSeekExact) + { + // TermsEnum is unpositioned if seekExact returns false + continue; + } + else if (loc == END_LOC) + { + continue; + } + else + { + loc = -loc - 1; + Assert.AreEqual(validTerms[loc], te.Term); + } + + // Do a bunch of next's after the seek + int numNext = Random().Next(validTerms.Length); + + for (int nextCount = 0; nextCount < numNext; nextCount++) + { + if (VERBOSE) + { + Console.WriteLine("\nTEST: next loc=" + loc + " of " + validTerms.Length); + } + BytesRef t2 = te.Next(); + loc++; + if (loc == validTerms.Length) + { + Assert.IsNull(t2); + break; + } + else + { + Assert.AreEqual(validTerms[loc], t2); + if (Random().Next(40) == 17 && termStates.Count < 100) + { + termStates.Add(new TermAndState(validTerms[loc], te.GetTermState())); + } + } + } + } + } + + [Test] + public virtual void TestIntersectBasic() + { + Directory dir = NewDirectory(); + IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + iwc.SetMergePolicy(new LogDocMergePolicy()); + RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); + Document doc = new Document(); + doc.Add(NewTextField("field", "aaa", Field.Store.NO)); + w.AddDocument(doc); + + doc = new Document(); + doc.Add(NewStringField("field", "bbb", Field.Store.NO));
+ w.AddDocument(doc); + + doc = new Document(); + doc.Add(NewTextField("field", "ccc", Field.Store.NO)); + w.AddDocument(doc); + + w.ForceMerge(1); + DirectoryReader r = w.Reader; + w.Dispose(); + AtomicReader sub = GetOnlySegmentReader(r); + Terms terms = sub.Fields.GetTerms("field"); + Automaton automaton = (new RegExp(".*", RegExp.NONE)).ToAutomaton(); + CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); + TermsEnum te = terms.Intersect(ca, null); + Assert.AreEqual("aaa", te.Next().Utf8ToString()); + Assert.AreEqual(0, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.AreEqual("bbb", te.Next().Utf8ToString()); + Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.AreEqual("ccc", te.Next().Utf8ToString()); + Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.IsNull(te.Next()); + + te = terms.Intersect(ca, new BytesRef("abc")); + Assert.AreEqual("bbb", te.Next().Utf8ToString()); + Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.AreEqual("ccc", te.Next().Utf8ToString()); + Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.IsNull(te.Next()); + + te = terms.Intersect(ca, new BytesRef("aaa")); + Assert.AreEqual("bbb", te.Next().Utf8ToString()); + Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.AreEqual("ccc", te.Next().Utf8ToString()); + Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.IsNull(te.Next()); + + r.Dispose(); + dir.Dispose(); + } + + [Test] + public virtual void TestIntersectStartTerm() + { + Directory dir = NewDirectory(); + IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + iwc.SetMergePolicy(new LogDocMergePolicy()); + + RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); + Document doc = new Document(); + doc.Add(NewStringField("field", "abc", Field.Store.NO)); + w.AddDocument(doc); + + doc = new Document(); + doc.Add(NewStringField("field", "abd", Field.Store.NO)); + w.AddDocument(doc); + + doc = new Document(); + doc.Add(NewStringField("field", "acd", Field.Store.NO)); + w.AddDocument(doc); + + doc = new Document(); + doc.Add(NewStringField("field", "bcd", Field.Store.NO)); + w.AddDocument(doc); + + w.ForceMerge(1); + DirectoryReader r = w.Reader; + w.Dispose(); + AtomicReader sub = GetOnlySegmentReader(r); + Terms terms = sub.Fields.GetTerms("field"); + + Automaton automaton = (new RegExp(".*d", RegExp.NONE)).ToAutomaton(); + CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); + TermsEnum te; + + // should seek to startTerm + te = terms.Intersect(ca, new BytesRef("aad")); + Assert.AreEqual("abd", te.Next().Utf8ToString()); + Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.AreEqual("acd", te.Next().Utf8ToString()); + Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.AreEqual("bcd", te.Next().Utf8ToString()); + Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.IsNull(te.Next()); + + // should fail to find ceil label on second arc, rewind + te = terms.Intersect(ca, new BytesRef("add")); + Assert.AreEqual("bcd", te.Next().Utf8ToString()); + Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); + Assert.IsNull(te.Next()); + + // should reach end + te = terms.Intersect(ca, new BytesRef("bcd")); + Assert.IsNull(te.Next()); + te = terms.Intersect(ca, new BytesRef("ddd")); + 
Assert.IsNull(te.Next()); + + r.Dispose(); + dir.Dispose(); + } + + [Test] + public virtual void TestIntersectEmptyString() + { + Directory dir = NewDirectory(); + IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + iwc.SetMergePolicy(new LogDocMergePolicy()); + RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); + Document doc = new Document(); + doc.Add(NewStringField("field", "", Field.Store.NO)); + doc.Add(NewStringField("field", "abc", Field.Store.NO)); + w.AddDocument(doc); + + doc = new Document(); + // add empty string to both documents, so that singletonDocID == -1. + // For an FST-based term dict, we expect the first arc to be + // flagged with HAS_FINAL_OUTPUT + doc.Add(NewStringField("field", "abc", Field.Store.NO)); + doc.Add(NewStringField("field", "", Field.Store.NO)); + w.AddDocument(doc); + + w.ForceMerge(1); + DirectoryReader r = w.Reader; + w.Dispose(); + AtomicReader sub = GetOnlySegmentReader(r); + Terms terms = sub.Fields.GetTerms("field"); + + Automaton automaton = (new RegExp(".*", RegExp.NONE)).ToAutomaton(); // accept ALL + CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); + + TermsEnum te = terms.Intersect(ca, null); + DocsEnum de; + + Assert.AreEqual("", te.Next().Utf8ToString()); + de = te.Docs(null, null, DocsEnum.FLAG_NONE); + Assert.AreEqual(0, de.NextDoc()); + Assert.AreEqual(1, de.NextDoc()); + + Assert.AreEqual("abc", te.Next().Utf8ToString()); + de = te.Docs(null, null, DocsEnum.FLAG_NONE); + Assert.AreEqual(0, de.NextDoc()); + Assert.AreEqual(1, de.NextDoc()); + + Assert.IsNull(te.Next()); + + // pass empty string + te = terms.Intersect(ca, new BytesRef("")); + + Assert.AreEqual("abc", te.Next().Utf8ToString()); + de = te.Docs(null, null, DocsEnum.FLAG_NONE); + Assert.AreEqual(0, de.NextDoc()); + Assert.AreEqual(1, de.NextDoc()); + + Assert.IsNull(te.Next()); + + r.Dispose(); + dir.Dispose(); + } + } +} \ No newline at end of file
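
The seek assertions throughout TestTermsEnum all exercise the same TermsEnum contract: SeekExact returns true only when the exact term exists (and leaves the enum unpositioned when it returns false), while SeekCeil positions the enum on the smallest term at or above the target and reports FOUND, NOT_FOUND, or END. A minimal sketch of that contract outside the test harness, using only calls that appear in the diff above (the reader variable and the "body" field are illustrative assumptions, not part of the commit):

// Assumes an open IndexReader named "reader" with an indexed "body" field.
TermsEnum te = MultiFields.GetTerms(reader, "body").GetIterator(null);

// SeekExact: true only for an exact match; on false the enum is unpositioned.
if (te.SeekExact(new BytesRef("apache")))
{
    Console.WriteLine("docFreq=" + te.DocFreq);
}

// SeekCeil: lands on the smallest term >= the target, in unicode order.
TermsEnum.SeekStatus status = te.SeekCeil(new BytesRef("apa"));
if (status != TermsEnum.SeekStatus.END)
{
    // FOUND is an exact hit; NOT_FOUND means the enum moved to the next term.
    Console.WriteLine(status + " -> " + te.Term.Utf8ToString());
}

// Next() continues from wherever the seek landed and returns null at the end.
BytesRef term;
while ((term = te.Next()) != null)
{
    Console.WriteLine(term.Utf8ToString());
}
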
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTermsEnum2.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Index/TestTermsEnum2.cs b/src/Lucene.Net.Tests/Index/TestTermsEnum2.cs new file mode 100644 index 0000000..734cfe0 --- /dev/null +++ b/src/Lucene.Net.Tests/Index/TestTermsEnum2.cs @@ -0,0 +1,204 @@ +using System.Collections.Generic; +using Lucene.Net.Documents; + +namespace Lucene.Net.Index +{ + using Lucene.Net.Randomized.Generators; + using Lucene.Net.Support; + using Lucene.Net.Util.Automaton; + using NUnit.Framework; + using AutomatonQuery = Lucene.Net.Search.AutomatonQuery; + using BytesRef = Lucene.Net.Util.BytesRef; + using CheckHits = Lucene.Net.Search.CheckHits; + using Codec = Lucene.Net.Codecs.Codec; + using Directory = Lucene.Net.Store.Directory; + using Document = Documents.Document; + using Field = Field; + using IndexSearcher = Lucene.Net.Search.IndexSearcher; + using LuceneTestCase = Lucene.Net.Util.LuceneTestCase; + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; + using MockTokenizer = Lucene.Net.Analysis.MockTokenizer; + using SeekStatus = Lucene.Net.Index.TermsEnum.SeekStatus; + using TestUtil = Lucene.Net.Util.TestUtil; + + [TestFixture] + public class TestTermsEnum2 : LuceneTestCase + { + private Directory Dir; + private IndexReader Reader; + private IndexSearcher Searcher; + private SortedSet<BytesRef> Terms; // the terms we put in the index + private Automaton TermsAutomaton; // automata of the same + internal int NumIterations; + + [SetUp] + public override void SetUp() + { + base.SetUp(); + // we generate awful regexps: good for testing. + // but for the preflex codec the test can be very slow, so use fewer iterations. + NumIterations = Codec.Default.Name.Equals("Lucene3x") ?
10 * RANDOM_MULTIPLIER : AtLeast(50); + Dir = NewDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(Random(), Dir, (IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false)).SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000))); + Document doc = new Document(); + Field field = NewStringField("field", "", Field.Store.YES); + doc.Add(field); + Terms = new SortedSet<BytesRef>(); + + int num = AtLeast(200); + for (int i = 0; i < num; i++) + { + string s = TestUtil.RandomUnicodeString(Random()); + field.SetStringValue(s); + Terms.Add(new BytesRef(s)); + writer.AddDocument(doc); + } + + TermsAutomaton = BasicAutomata.MakeStringUnion(Terms); + + Reader = writer.Reader; + Searcher = NewSearcher(Reader); + writer.Dispose(); + } + + [TearDown] + public override void TearDown() + { + Reader.Dispose(); + Dir.Dispose(); + base.TearDown(); + } + + /// <summary> + /// tests a pre-intersected automaton against the original </summary> + [Test] + public virtual void TestFiniteVersusInfinite() + { + for (int i = 0; i < NumIterations; i++) + { + string reg = AutomatonTestUtil.RandomRegexp(Random()); + Automaton automaton = (new RegExp(reg, RegExp.NONE)).ToAutomaton(); + IList<BytesRef> matchedTerms = new List<BytesRef>(); + foreach (BytesRef t in Terms) + { + if (BasicOperations.Run(automaton, t.Utf8ToString())) + { + matchedTerms.Add(t); + } + } + + Automaton alternate = BasicAutomata.MakeStringUnion(matchedTerms); + //System.out.println("match " + matchedTerms.Size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().Length); + //AutomatonTestUtil.minimizeSimple(alternate); + //System.out.println("minmize done"); + AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton); + AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate); + CheckHits.CheckEqual(a1, Searcher.Search(a1, 25).ScoreDocs, Searcher.Search(a2, 25).ScoreDocs); + } + } + + /// <summary> + /// seeks to every term accepted by some automata </summary> + [Test] + public virtual void TestSeeking() + { + for (int i = 0; i < NumIterations; i++) + { + string reg = AutomatonTestUtil.RandomRegexp(Random()); + Automaton automaton = (new RegExp(reg, RegExp.NONE)).ToAutomaton(); + TermsEnum te = MultiFields.GetTerms(Reader, "field").GetIterator(null); + IList<BytesRef> unsortedTerms = new List<BytesRef>(Terms); + Collections.Shuffle(unsortedTerms); + + foreach (BytesRef term in unsortedTerms) + { + if (BasicOperations.Run(automaton, term.Utf8ToString())) + { + // term is accepted + if (Random().NextBoolean()) + { + // seek exact + Assert.IsTrue(te.SeekExact(term)); + } + else + { + // seek ceil + Assert.AreEqual(SeekStatus.FOUND, te.SeekCeil(term)); + Assert.AreEqual(term, te.Term); + } + } + } + } + } + + /// <summary> + /// mixes up seek and next for all terms </summary> + [Test] + public virtual void TestSeekingAndNexting() + { + for (int i = 0; i < NumIterations; i++) + { + TermsEnum te = MultiFields.GetTerms(Reader, "field").GetIterator(null); + + foreach (BytesRef term in Terms) + { + int c = Random().Next(3); + if (c == 0) + { + Assert.AreEqual(term, te.Next()); + } + else if (c == 1) + { + Assert.AreEqual(SeekStatus.FOUND, te.SeekCeil(term)); + Assert.AreEqual(term, te.Term); + } + else + { + Assert.IsTrue(te.SeekExact(term)); + } + } + } + } + + /// <summary> + /// tests intersect: TODO start at a random term! 
</summary> + [Test] + public virtual void TestIntersect() + { + for (int i = 0; i < NumIterations; i++) + { + string reg = AutomatonTestUtil.RandomRegexp(Random()); + Automaton automaton = (new RegExp(reg, RegExp.NONE)).ToAutomaton(); + CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.IsFinite(automaton), false); + TermsEnum te = MultiFields.GetTerms(Reader, "field").Intersect(ca, null); + Automaton expected = BasicOperations.Intersection(TermsAutomaton, automaton); + SortedSet<BytesRef> found = new SortedSet<BytesRef>(); + while (te.Next() != null) + { + found.Add(BytesRef.DeepCopyOf(te.Term)); + } + + Automaton actual = BasicAutomata.MakeStringUnion(found); + Assert.IsTrue(BasicOperations.SameLanguage(expected, actual)); + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestThreadedForceMerge.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Index/TestThreadedForceMerge.cs b/src/Lucene.Net.Tests/Index/TestThreadedForceMerge.cs new file mode 100644 index 0000000..e4e2d59 --- /dev/null +++ b/src/Lucene.Net.Tests/Index/TestThreadedForceMerge.cs @@ -0,0 +1,183 @@ +using System; +using System.Threading; +using Lucene.Net.Documents; + +namespace Lucene.Net.Index +{ + using Lucene.Net.Support; + using NUnit.Framework; + using Analyzer = Lucene.Net.Analysis.Analyzer; + using Directory = Lucene.Net.Store.Directory; + using Document = Documents.Document; + using English = Lucene.Net.Util.English; + using FieldType = FieldType; + using LuceneTestCase = Lucene.Net.Util.LuceneTestCase; + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; + using MockTokenizer = Lucene.Net.Analysis.MockTokenizer; + using StringField = StringField; + + [TestFixture] + public class TestThreadedForceMerge : LuceneTestCase + { + private static Analyzer ANALYZER; + + private const int NUM_THREADS = 3; + //private final static int NUM_THREADS = 5; + + private const int NUM_ITER = 1; + + private const int NUM_ITER2 = 1; + + private volatile bool Failed; + + [SetUp] + public static void Setup() + { + ANALYZER = new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true); + } + + private void SetFailed() + { + Failed = true; + } + + public virtual void RunTest(Random random, Directory directory) + { + IndexWriter writer = new IndexWriter(directory, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, ANALYZER).SetOpenMode(OpenMode.CREATE).SetMaxBufferedDocs(2)).SetMergePolicy(NewLogMergePolicy())); + + for (int iter = 0; iter < NUM_ITER; iter++) + { + int iterFinal = iter; + + ((LogMergePolicy)writer.Config.MergePolicy).MergeFactor = 1000; + + FieldType customType = new FieldType(StringField.TYPE_STORED); + customType.OmitNorms = true; + + for (int i = 0; i < 200; i++) + { + Document d = new Document(); + d.Add(NewField("id", Convert.ToString(i), customType)); + d.Add(NewField("contents", English.IntToEnglish(i), customType)); + writer.AddDocument(d); + } + + ((LogMergePolicy)writer.Config.MergePolicy).MergeFactor = 4; + + ThreadClass[] threads = new ThreadClass[NUM_THREADS]; + + for (int i = 0; i < NUM_THREADS; i++) + { + int iFinal = i; + IndexWriter writerFinal = writer; + threads[i] = new ThreadAnonymousInnerClassHelper(this, iterFinal, customType, iFinal, writerFinal); + } + + for (int i = 0; i < NUM_THREADS; i++) + { + threads[i].Start(); + } + + for (int i = 0; i < NUM_THREADS; i++) + { + threads[i].Join(); + } + + Assert.IsTrue(!Failed); + + int expectedDocCount = (int)((1 + iter) * (200 + 8 * NUM_ITER2 * (NUM_THREADS / 2.0) * (1 + NUM_THREADS))); + + Assert.AreEqual(expectedDocCount, writer.NumDocs, "index=" + writer.SegString() + " numDocs=" + writer.NumDocs + " maxDoc=" + writer.MaxDoc + " config=" + writer.Config); + Assert.AreEqual(expectedDocCount, writer.MaxDoc, "index=" + writer.SegString() + " numDocs=" + writer.NumDocs + " maxDoc=" + writer.MaxDoc + " config=" + writer.Config); + + writer.Dispose(); + writer = new IndexWriter(directory, (IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, ANALYZER).SetOpenMode(OpenMode.APPEND).SetMaxBufferedDocs(2)); + + DirectoryReader reader = DirectoryReader.Open(directory); + Assert.AreEqual(1, reader.Leaves.Count, "reader=" + reader); + Assert.AreEqual(expectedDocCount, reader.NumDocs); + reader.Dispose(); + } + writer.Dispose(); + } + + private class ThreadAnonymousInnerClassHelper : ThreadClass + { + private readonly TestThreadedForceMerge OuterInstance; + + private int IterFinal; + private FieldType CustomType; + private int IFinal; + private IndexWriter WriterFinal; + + public ThreadAnonymousInnerClassHelper(TestThreadedForceMerge outerInstance, int iterFinal, FieldType customType, int iFinal, IndexWriter writerFinal) + { + this.OuterInstance = outerInstance; + this.IterFinal = iterFinal; + this.CustomType = customType; + this.IFinal = iFinal; + this.WriterFinal = writerFinal; + } + + public override void Run() + { + try + { + for (int j = 0; j < NUM_ITER2; j++) + { + WriterFinal.ForceMerge(1, false); + for (int k = 0; k < 17 * (1 + IFinal); k++) + { + Document d = new Document(); + 
d.Add(OuterInstance.NewField("id", IterFinal + "_" + IFinal + "_" + j + "_" + k, CustomType)); + d.Add(OuterInstance.NewField("contents", English.IntToEnglish(IFinal + k), CustomType)); + WriterFinal.AddDocument(d); + } + for (int k = 0; k < 9 * (1 + IFinal); k++) + { + WriterFinal.DeleteDocuments(new Term("id", IterFinal + "_" + IFinal + "_" + j + "_" + k)); + } + WriterFinal.ForceMerge(1); + } + } + catch (Exception t) + { + OuterInstance.SetFailed(); + Console.WriteLine(Thread.CurrentThread.Name + ": hit exception"); + Console.WriteLine(t.StackTrace); + } + } + } + + /* + Run above stress test against RAMDirectory and then + FSDirectory. + */ + + [Test] + public virtual void TestThreadedForceMerge_Mem() + { + Directory directory = NewDirectory(); + RunTest(Random(), directory); + directory.Dispose(); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTieredMergePolicy.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Index/TestTieredMergePolicy.cs b/src/Lucene.Net.Tests/Index/TestTieredMergePolicy.cs new file mode 100644 index 0000000..793736d --- /dev/null +++ b/src/Lucene.Net.Tests/Index/TestTieredMergePolicy.cs @@ -0,0 +1,297 @@ +using Lucene.Net.Attributes; +using Lucene.Net.Documents; +using Lucene.Net.Store; +using NUnit.Framework; +using System; + +namespace Lucene.Net.Index +{ + using Directory = Lucene.Net.Store.Directory; + using Document = Documents.Document; + using Field = Field; + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; + using TestUtil = Lucene.Net.Util.TestUtil; + + [TestFixture] + public class TestTieredMergePolicy : BaseMergePolicyTestCase + { + protected internal override MergePolicy MergePolicy() + { + return NewTieredMergePolicy(); + } + + [Test, LuceneNetSpecific] + public virtual void TestIndexWriterDirtSimple() + { + Directory dir = new RAMDirectory(); + IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + TieredMergePolicy tmp = NewTieredMergePolicy(); + iwc.SetMergePolicy(tmp); + iwc.SetMaxBufferedDocs(2); + tmp.MaxMergeAtOnce = 100; + tmp.SegmentsPerTier = 100; + tmp.ForceMergeDeletesPctAllowed = 30.0; + IndexWriter w = new IndexWriter(dir, iwc); + + int numDocs = 2; + + for (int i = 0; i < numDocs; i++) + { + Document doc = new Document(); + doc.Add(NewTextField("content", "aaa " + i, Field.Store.NO)); + w.AddDocument(doc); + } + + Assert.AreEqual(numDocs, w.MaxDoc); + Assert.AreEqual(numDocs, w.NumDocs); + } + + [Test] + public virtual void TestForceMergeDeletes() + { + Directory dir = NewDirectory(); + IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + TieredMergePolicy tmp = NewTieredMergePolicy(); + conf.SetMergePolicy(tmp); + conf.SetMaxBufferedDocs(4); + tmp.MaxMergeAtOnce = 100; + tmp.SegmentsPerTier = 100; + tmp.ForceMergeDeletesPctAllowed = 30.0; + IndexWriter w = new IndexWriter(dir, conf); + for (int i = 0; i < 80; i++) + { + Document doc = new Document(); + doc.Add(NewTextField("content", "aaa " + (i % 4), Field.Store.NO)); + w.AddDocument(doc); + } + Assert.AreEqual(80, w.MaxDoc); + Assert.AreEqual(80, w.NumDocs); + + if (VERBOSE) + { + Console.WriteLine("\nTEST: delete docs"); + } + w.DeleteDocuments(new Term("content", "0")); + w.ForceMergeDeletes(); + + Assert.AreEqual(80, w.MaxDoc); + Assert.AreEqual(60, w.NumDocs); + + if (VERBOSE) + { + Console.WriteLine("\nTEST: forceMergeDeletes2"); + } + ((TieredMergePolicy)w.Config.MergePolicy).ForceMergeDeletesPctAllowed = 10.0; + w.ForceMergeDeletes(); + Assert.AreEqual(60, w.NumDocs); + Assert.AreEqual(60, w.MaxDoc); + w.Dispose(); + dir.Dispose(); + } + + [Test] + public virtual void TestPartialMerge() + { + int num = AtLeast(10); + for (int iter = 0; iter < num; iter++) + { + if (VERBOSE) + { + Console.WriteLine("TEST: iter=" + iter); + } + Directory dir = NewDirectory(); + IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + conf.SetMergeScheduler(new SerialMergeScheduler()); + TieredMergePolicy tmp = NewTieredMergePolicy(); + conf.SetMergePolicy(tmp); + conf.SetMaxBufferedDocs(2); + tmp.MaxMergeAtOnce = 3; + tmp.SegmentsPerTier = 6; + + IndexWriter w = new IndexWriter(dir, conf); + int maxCount = 0; + int numDocs = TestUtil.NextInt(Random(), 20, 100); + for (int i = 0; i < numDocs; i++) + { + Document doc = new Document(); + doc.Add(NewTextField("content", "aaa " + (i % 4), Field.Store.NO)); + w.AddDocument(doc); + int count = w.SegmentCount; + maxCount = Math.Max(count, maxCount); + Assert.IsTrue(count >= maxCount - 3, "count=" + count + " maxCount=" + maxCount); + } + + w.Flush(true, true); + + int segmentCount = w.SegmentCount; + int targetCount = TestUtil.NextInt(Random(), 1, segmentCount); + if (VERBOSE) + { + Console.WriteLine("TEST: merge to " + targetCount + " segs (current count=" + segmentCount + ")"); + } + w.ForceMerge(targetCount); + Assert.AreEqual(targetCount, w.SegmentCount); + + w.Dispose(); + dir.Dispose(); + } 
+ } + + [Test] + public virtual void TestForceMergeDeletesMaxSegSize() + { + Directory dir = NewDirectory(); + IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); + TieredMergePolicy tmp = new TieredMergePolicy(); + tmp.MaxMergedSegmentMB = 0.01; + tmp.ForceMergeDeletesPctAllowed = 0.0; + conf.SetMergePolicy(tmp); + + RandomIndexWriter w = new RandomIndexWriter(Random(), dir, conf); + w.RandomForceMerge = false; + + int numDocs = AtLeast(200); + for (int i = 0; i < numDocs; i++) + { + Document doc = new Document(); + doc.Add(NewStringField("id", "" + i, Field.Store.NO)); + doc.Add(NewTextField("content", "aaa " + i, Field.Store.NO)); + w.AddDocument(doc); + } + + w.ForceMerge(1); + IndexReader r = w.Reader; + Assert.AreEqual(numDocs, r.MaxDoc); + Assert.AreEqual(numDocs, r.NumDocs); + r.Dispose(); + + if (VERBOSE) + { + Console.WriteLine("\nTEST: delete doc"); + } + + w.DeleteDocuments(new Term("id", "" + (42 + 17))); + + r = w.Reader; + Assert.AreEqual(numDocs, r.MaxDoc); + Assert.AreEqual(numDocs - 1, r.NumDocs); + r.Dispose(); + + w.ForceMergeDeletes(); + + r = w.Reader; + Assert.AreEqual(numDocs - 1, r.MaxDoc); + Assert.AreEqual(numDocs - 1, r.NumDocs); + r.Dispose(); + + w.Dispose(); + + dir.Dispose(); + } + + private const double EPSILON = 1E-14; + + [Test] + public virtual void TestSetters() + { + TieredMergePolicy tmp = new TieredMergePolicy(); + + tmp.MaxMergedSegmentMB = 0.5; + Assert.AreEqual(0.5, tmp.MaxMergedSegmentMB, EPSILON); + + tmp.MaxMergedSegmentMB = double.PositiveInfinity; + Assert.AreEqual(long.MaxValue / 1024 / 1024.0, tmp.MaxMergedSegmentMB, EPSILON * long.MaxValue); + + tmp.MaxMergedSegmentMB = long.MaxValue / 1024 / 1024.0; + Assert.AreEqual(long.MaxValue / 1024 / 1024.0, tmp.MaxMergedSegmentMB, EPSILON * long.MaxValue); + + try + { + tmp.MaxMergedSegmentMB = -2.0; + Assert.Fail("Didn't throw IllegalArgumentException"); + } +#pragma warning disable 168 + catch (System.ArgumentException iae) +#pragma warning restore 168 + { + // pass + } + + tmp.FloorSegmentMB = 2.0; + Assert.AreEqual(2.0, tmp.FloorSegmentMB, EPSILON); + + tmp.FloorSegmentMB = double.PositiveInfinity; + Assert.AreEqual(long.MaxValue / 1024 / 1024.0, tmp.FloorSegmentMB, EPSILON * long.MaxValue); + + tmp.FloorSegmentMB = long.MaxValue / 1024 / 1024.0; + Assert.AreEqual(long.MaxValue / 1024 / 1024.0, tmp.FloorSegmentMB, EPSILON * long.MaxValue); + + try + { + tmp.FloorSegmentMB = -2.0; + Assert.Fail("Didn't throw IllegalArgumentException"); + } +#pragma warning disable 168 + catch (System.ArgumentException iae) +#pragma warning restore 168 + { + // pass + } + + tmp.MaxCFSSegmentSizeMB = 2.0; + Assert.AreEqual(2.0, tmp.MaxCFSSegmentSizeMB, EPSILON); + + tmp.MaxCFSSegmentSizeMB = double.PositiveInfinity; + Assert.AreEqual(long.MaxValue / 1024 / 1024.0, tmp.MaxCFSSegmentSizeMB, EPSILON * long.MaxValue); + + tmp.MaxCFSSegmentSizeMB = long.MaxValue / 1024 / 1024.0; + Assert.AreEqual(long.MaxValue / 1024 / 1024.0, tmp.MaxCFSSegmentSizeMB, EPSILON * long.MaxValue); + + try + { + tmp.MaxCFSSegmentSizeMB = -2.0; + Assert.Fail("Didn't throw IllegalArgumentException"); + } +#pragma warning disable 168 + catch (System.ArgumentException iae) +#pragma warning restore 168 + { + // pass + } + + // TODO: Add more checks for other non-double setters! + } + + + #region BaseMergePolicyTestCase + // LUCENENET NOTE: Tests in an abstract base class are not pulled into the correct + // context in Visual Studio. 
This fixes that with the minimum amount of code necessary + // to run them in the correct context without duplicating all of the tests. + + [Test] + public override void TestForceMergeNotNeeded() + { + base.TestForceMergeNotNeeded(); + } + + #endregion + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Index/TestTransactionRollback.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Index/TestTransactionRollback.cs b/src/Lucene.Net.Tests/Index/TestTransactionRollback.cs new file mode 100644 index 0000000..0ff979a --- /dev/null +++ b/src/Lucene.Net.Tests/Index/TestTransactionRollback.cs @@ -0,0 +1,271 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using Lucene.Net.Documents; + +namespace Lucene.Net.Index +{ + using Lucene.Net.Support; + using NUnit.Framework; + using IBits = Lucene.Net.Util.IBits; + using Directory = Lucene.Net.Store.Directory; + using Document = Documents.Document; + using Field = Field; + using LuceneTestCase = Lucene.Net.Util.LuceneTestCase; + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; + + /// <summary> + /// Test class to illustrate using IndexDeletionPolicy to provide multi-level rollback capability. + /// This test case creates an index of records 1 to 100, introducing a commit point every 10 records.
+ /// + /// A "keep all" deletion policy is used to ensure we keep all commit points for testing purposes + /// </summary> + + [TestFixture] + public class TestTransactionRollback : LuceneTestCase + { + private const string FIELD_RECORD_ID = "record_id"; + private Directory Dir; + + //Rolls back index to a chosen ID + private void RollBackLast(int id) + { + // System.out.println("Attempting to rollback to "+id); + string ids = "-" + id; + IndexCommit last = null; + ICollection<IndexCommit> commits = DirectoryReader.ListCommits(Dir); + for (IEnumerator<IndexCommit> iterator = commits.GetEnumerator(); iterator.MoveNext(); ) + { + IndexCommit commit = iterator.Current; + IDictionary<string, string> ud = commit.UserData; + if (ud.Count > 0) + { + if (ud["index"].EndsWith(ids)) + { + last = commit; + } + } + } + + if (last == null) + { + throw new Exception("Couldn't find commit point " + id); + } + + IndexWriter w = new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetIndexDeletionPolicy(new RollbackDeletionPolicy(this, id)).SetIndexCommit(last)); + IDictionary<string, string> data = new Dictionary<string, string>(); + data["index"] = "Rolled back to 1-" + id; + w.CommitData = data; + w.Dispose(); + } + + [Test] + public virtual void TestRepeatedRollBacks() + { + int expectedLastRecordId = 100; + while (expectedLastRecordId > 10) + { + expectedLastRecordId -= 10; + RollBackLast(expectedLastRecordId); + + BitArray expecteds = new BitArray(100); + expecteds.Set(1, (expectedLastRecordId + 1), true); + CheckExpecteds(expecteds); + } + } + + private void CheckExpecteds(BitArray expecteds) + { + IndexReader r = DirectoryReader.Open(Dir); + + //Perhaps not the most efficient approach but meets our + //needs here. + IBits liveDocs = MultiFields.GetLiveDocs(r); + for (int i = 0; i < r.MaxDoc; i++) + { + if (liveDocs == null || liveDocs.Get(i)) + { + string sval = r.Document(i).Get(FIELD_RECORD_ID); + if (sval != null) + { + int val = Convert.ToInt32(sval); + Assert.IsTrue(expecteds.SafeGet(val), "Did not expect document #" + val); + expecteds.SafeSet(val, false); + } + } + } + r.Dispose(); + Assert.AreEqual(0, expecteds.Cardinality(), "Should have 0 docs remaining "); + } + + /* + private void showAvailableCommitPoints() throws Exception { + Collection commits = DirectoryReader.ListCommits(dir); + for (Iterator iterator = commits.iterator(); iterator.hasNext();) { + IndexCommit comm = (IndexCommit) iterator.Next(); + System.out.print("\t Available commit point:["+comm.getUserData()+"] files="); + Collection files = comm.getFileNames(); + for (Iterator iterator2 = files.iterator(); iterator2.hasNext();) { + String filename = (String) iterator2.Next(); + System.out.print(filename+", "); + } + System.out.println(); + } + } + */ + + [SetUp] + public override void SetUp() + { + base.SetUp(); + Dir = NewDirectory(); + + //Build index, of records 1 to 100, committing after each batch of 10 + IndexDeletionPolicy sdp = new KeepAllDeletionPolicy(this); + IndexWriter w = new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetIndexDeletionPolicy(sdp)); + + for (int currentRecordId = 1; currentRecordId <= 100; currentRecordId++) + { + Document doc = new Document(); + doc.Add(NewTextField(FIELD_RECORD_ID, "" + currentRecordId, Field.Store.YES)); + w.AddDocument(doc); + + if (currentRecordId % 10 == 0) + { + IDictionary<string, string> data = new Dictionary<string, string>(); + data["index"] = "records 1-" + currentRecordId; + 
w.CommitData = data; + w.Commit(); + } + } + + w.Dispose(); + } + + [TearDown] + public override void TearDown() + { + Dir.Dispose(); + base.TearDown(); + } + + // Rolls back to previous commit point + internal class RollbackDeletionPolicy : IndexDeletionPolicy + { + private readonly TestTransactionRollback OuterInstance; + + internal int RollbackPoint; + + public RollbackDeletionPolicy(TestTransactionRollback outerInstance, int rollbackPoint) + { + this.OuterInstance = outerInstance; + this.RollbackPoint = rollbackPoint; + } + + public override void OnCommit<T>(IList<T> commits) + { + } + + public override void OnInit<T>(IList<T> commits) + { + foreach (IndexCommit commit in commits) + { + IDictionary<string, string> userData = commit.UserData; + if (userData.Count > 0) + { + // Label for a commit point is "Records 1-30" + // this code reads the last id ("30" in this example) and deletes it + // if it is after the desired rollback point + string x = userData["index"]; + string lastVal = x.Substring(x.LastIndexOf("-") + 1); + int last = Convert.ToInt32(lastVal); + if (last > RollbackPoint) + { + /* + System.out.print("\tRolling back commit point:" + + " UserData="+commit.getUserData() +") ("+(commits.Size()-1)+" commit points left) files="); + Collection files = commit.getFileNames(); + for (Iterator iterator2 = files.iterator(); iterator2.hasNext();) { + System.out.print(" "+iterator2.Next()); + } + System.out.println(); + */ + + commit.Delete(); + } + } + } + } + } + + internal class DeleteLastCommitPolicy : IndexDeletionPolicy + { + private readonly TestTransactionRollback OuterInstance; + + public DeleteLastCommitPolicy(TestTransactionRollback outerInstance) + { + this.OuterInstance = outerInstance; + } + + public override void OnCommit<T>(IList<T> commits) + { + } + + public override void OnInit<T>(IList<T> commits) + { + commits.RemoveAt(commits.Count - 1); + } + } + + [Test] + public virtual void TestRollbackDeletionPolicy() + { + for (int i = 0; i < 2; i++) + { + // Unless you specify a prior commit point, rollback + // should not work: + (new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetIndexDeletionPolicy(new DeleteLastCommitPolicy(this)))).Dispose(); + IndexReader r = DirectoryReader.Open(Dir); + Assert.AreEqual(100, r.NumDocs); + r.Dispose(); + } + } + + // Keeps all commit points (used to build index) + internal class KeepAllDeletionPolicy : IndexDeletionPolicy + { + private readonly TestTransactionRollback OuterInstance; + + public KeepAllDeletionPolicy(TestTransactionRollback outerInstance) + { + this.OuterInstance = outerInstance; + } + + public override void OnCommit<T>(IList<T> commits) + { + } + + public override void OnInit<T>(IList<T> commits) + { + } + } + } +} \ No newline at end of file
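
The RollBackLast helper above compresses the whole rollback recipe into three public API calls that the diff also uses individually: IndexWriter.CommitData labels each commit, DirectoryReader.ListCommits enumerates the surviving commit points, and IndexWriterConfig.SetIndexCommit reopens a writer on a chosen commit. A condensed sketch of the same pattern (assuming a Directory named dir whose commits were preserved by a keep-all policy, as in SetUp; the "-30" label is an illustrative rollback point, not a value from the commit):

// Locate the commit point whose user-data label ends with the target id.
IndexCommit target = null;
foreach (IndexCommit commit in DirectoryReader.ListCommits(dir))
{
    IDictionary<string, string> ud = commit.UserData;
    if (ud.Count > 0 && ud["index"].EndsWith("-30"))
    {
        target = commit; // e.g. the commit labeled "records 1-30"
    }
}

if (target != null)
{
    // Reopening a writer on the chosen commit rolls the index back; the test
    // additionally installs RollbackDeletionPolicy so the later commit points
    // are deleted rather than left behind.
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetIndexCommit(target));
    w.Dispose();
}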