// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Analysis/TestGraphTokenizers.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Tests/Analysis/TestGraphTokenizers.cs b/src/Lucene.Net.Tests/Analysis/TestGraphTokenizers.cs
// new file mode 100644
// index 0000000..73619d1
// --- /dev/null
// +++ b/src/Lucene.Net.Tests/Analysis/TestGraphTokenizers.cs
// @@ -0,0 +1,728 @@
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace Lucene.Net.Analysis
{
    using Lucene.Net.Support;
    using NUnit.Framework;
    using System.IO;

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    using Automaton = Lucene.Net.Util.Automaton.Automaton;
    using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata;
    using BasicOperations = Lucene.Net.Util.Automaton.BasicOperations;

    /// <summary>
    /// Tests that run <see cref="MockGraphTokenFilter"/> over various token streams
    /// (plain, graph-shaped, with holes) and that compare the automaton built by
    /// <see cref="TokenStreamToAutomaton"/> against hand-built expected automata.
    /// </summary>
    [TestFixture]
    public class TestGraphTokenizers : BaseTokenStreamTestCase
    {
        // Makes a graph TokenStream from the string; separate
        // positions with single space, multiple tokens at the same
        // position with /, and add optional position length with
        // :. EG "a b c" is a simple chain, "a/x b c" adds 'x'
        // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
        // 'x' over a with posLen=3. Tokens are in normal-form!
        // So, offsets are computed based on the first token at a
        // given position. NOTE: each token must be a single
        // character! We assume this when computing offsets...

        // NOTE: all input tokens must be length 1!!! this means
        // you cannot turn on MockCharFilter when random
        // testing...

        /// <summary>
        /// Tokenizer that reads its whole input on the first call to
        /// <see cref="IncrementToken"/>, parses it with the mini-syntax described
        /// above, and then replays the resulting tokens one by one.
        /// </summary>
        private class GraphTokenizer : Tokenizer
        {
            // Parsed tokens; null until FillTokens() has run for the current input.
            internal IList<Token> Tokens;
            // Index of the next token in Tokens to return.
            internal int Upto;
            // Number of chars read from the input by FillTokens().
            internal int InputLength;

            internal readonly ICharTermAttribute TermAtt;
            internal readonly IOffsetAttribute OffsetAtt;
            internal readonly IPositionIncrementAttribute PosIncrAtt;
            internal readonly IPositionLengthAttribute PosLengthAtt;

            public GraphTokenizer(TextReader input)
                : base(input)
            {
                TermAtt = AddAttribute<ICharTermAttribute>();
                OffsetAtt = AddAttribute<IOffsetAttribute>();
                PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
                PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
            }

            /// <summary>
            /// Drops the parsed tokens so the next <see cref="IncrementToken"/>
            /// re-reads and re-parses the input.
            /// </summary>
            public override void Reset()
            {
                base.Reset();
                Tokens = null;
                Upto = 0;
            }

            /// <summary>
            /// Copies the next parsed token into the attributes.
            /// </summary>
            /// <returns><c>true</c> if a token was produced; <c>false</c> once all parsed tokens were returned.</returns>
            public sealed override bool IncrementToken()
            {
                if (Tokens == null)
                {
                    FillTokens();
                }
                //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
                if (Upto == Tokens.Count)
                {
                    //System.out.println(" END @ " + tokens.size());
                    return false;
                }
                Token t = Tokens[Upto++];
                //System.out.println(" return token=" + t);
                ClearAttributes();
                TermAtt.Append(t.ToString());
                OffsetAtt.SetOffset(t.StartOffset, t.EndOffset);
                PosIncrAtt.PositionIncrement = t.PositionIncrement;
                PosLengthAtt.PositionLength = t.PositionLength;
                return true;
            }

            /// <summary>
            /// Sets the final offsets from the last parsed token's end offset
            /// (0 when there are no tokens) and the input length.
            /// </summary>
            public override void End()
            {
                base.End();
                // NOTE: somewhat... hackish, but we need this to
                // satisfy BTSTC:
                int lastOffset;
                if (Tokens != null && Tokens.Count > 0)
                {
                    lastOffset = Tokens[Tokens.Count - 1].EndOffset;
                }
                else
                {
                    lastOffset = 0;
                }
                OffsetAtt.SetOffset(CorrectOffset(lastOffset), CorrectOffset(InputLength));
            }

            /// <summary>
            /// Reads the entire input and parses it into <see cref="Tokens"/>:
            /// positions are separated by ' ', tokens stacked at one position by '/',
            /// and an optional ":N" suffix gives a token's position length.
            /// </summary>
            internal virtual void FillTokens()
            {
                StringBuilder sb = new StringBuilder();
                char[] buffer = new char[256];
                while (true)
                {
                    int count = m_input.Read(buffer, 0, buffer.Length);

                    //.NET TextReader.Read(buff, int, int) returns 0, not -1 on no chars
                    // but in some cases, such as MockCharFilter, it overloads read and returns -1
                    // so we should handle both 0 and -1 values
                    if (count <= 0)
                    {
                        break;
                    }
                    sb.Append(buffer, 0, count);
                    //System.out.println("got count=" + count);
                }
                //System.out.println("fillTokens: " + sb);

                InputLength = sb.Length;

                string[] parts = sb.ToString().Split(' ');

                Tokens = new List<Token>();
                int pos = 0;
                int maxPos = -1;
                int offset = 0;
                //System.out.println("again");
                foreach (string part in parts)
                {
                    string[] overlapped = part.Split('/');
                    bool firstAtPos = true;
                    int minPosLength = int.MaxValue;
                    foreach (string part2 in overlapped)
                    {
                        int colonIndex = part2.IndexOf(':');
                        string token;
                        int posLength;
                        if (colonIndex != -1)
                        {
                            token = part2.Substring(0, colonIndex);
                            // The graph syntax is a fixed machine format, so parse the
                            // position length independently of the current thread culture.
                            posLength = int.Parse(part2.Substring(1 + colonIndex), System.Globalization.CultureInfo.InvariantCulture);
                        }
                        else
                        {
                            token = part2;
                            posLength = 1;
                        }
                        maxPos = Math.Max(maxPos, pos + posLength);
                        minPosLength = Math.Min(minPosLength, posLength);
                        // Single-char tokens separated by single spaces: a token spanning
                        // posLength positions covers 2 * posLength - 1 chars.
                        Token t = new Token(token, offset, offset + 2 * posLength - 1);
                        t.PositionLength = posLength;
                        // Only the first token at a position advances the position.
                        t.PositionIncrement = firstAtPos ? 1 : 0;
                        firstAtPos = false;
                        //System.out.println(" add token=" + t + " startOff=" + t.StartOffset + " endOff=" + t.EndOffset);
                        Tokens.Add(t);
                    }
                    pos += minPosLength;
                    offset = 2 * pos;
                }
                Debug.Assert(maxPos <= pos, "input string mal-formed: posLength>1 tokens hang over the end");
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterBasic()
        {
            for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new AnalyzerAnonymousInnerClassHelper(this);

                CheckAnalysisConsistency(Random(), a, false, "a b c d e f g h i j k");
            }
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> MockGraphTokenFilter.</summary>
        private class AnalyzerAnonymousInnerClassHelper : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public AnalyzerAnonymousInnerClassHelper(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t2 = new MockGraphTokenFilter(Random(), t);
                return new TokenStreamComponents(t, t2);
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterOnGraphInput()
        {
            for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this);

                CheckAnalysisConsistency(Random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
            }
        }

        /// <summary>Analyzer: GraphTokenizer -> MockGraphTokenFilter.</summary>
        private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public AnalyzerAnonymousInnerClassHelper2(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new GraphTokenizer(reader);
                TokenStream t2 = new MockGraphTokenFilter(Random(), t);
                return new TokenStreamComponents(t, t2);
            }
        }

        // Just deletes (leaving hole) token 'a':
        private sealed class RemoveATokens : TokenFilter
        {
            // Position increments of removed tokens not yet applied to an emitted token.
            internal int PendingPosInc;

            internal readonly ICharTermAttribute TermAtt;// = addAttribute(typeof(CharTermAttribute));
            internal readonly IPositionIncrementAttribute PosIncAtt;// = addAttribute(typeof(PositionIncrementAttribute));

            public RemoveATokens(TokenStream @in)
                : base(@in)
            {
                TermAtt = AddAttribute<ICharTermAttribute>();
                PosIncAtt = AddAttribute<IPositionIncrementAttribute>();
            }

            public override void Reset()
            {
                base.Reset();
                PendingPosInc = 0;
            }

            public override void End()
            {
                base.End();
                // Carry increments of trailing removed tokens into the end state.
                PosIncAtt.PositionIncrement = PendingPosInc + PosIncAtt.PositionIncrement;
            }

            public override bool IncrementToken()
            {
                while (true)
                {
                    bool gotOne = m_input.IncrementToken();
                    if (!gotOne)
                    {
                        return false;
                    }
                    else if (TermAtt.ToString().Equals("a"))
                    {
                        // Skip the token but remember its increment so a hole is left.
                        PendingPosInc += PosIncAtt.PositionIncrement;
                    }
                    else
                    {
                        PosIncAtt.PositionIncrement = PendingPosInc + PosIncAtt.PositionIncrement;
                        PendingPosInc = 0;
                        return true;
                    }
                }
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterBeforeHoles()
        {
            for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new MGTFBHAnalyzerAnonymousInnerClassHelper(this);

                Random random = Random();
                CheckAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
                CheckAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
                CheckAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
                CheckAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
            }
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> MockGraphTokenFilter -> RemoveATokens.</summary>
        private class MGTFBHAnalyzerAnonymousInnerClassHelper : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public MGTFBHAnalyzerAnonymousInnerClassHelper(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t2 = new MockGraphTokenFilter(Random(), t);
                TokenStream t3 = new RemoveATokens(t2);
                return new TokenStreamComponents(t, t3);
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterAfterHoles()
        {
            for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new MGTFAHAnalyzerAnonymousInnerClassHelper2(this);

                Random random = Random();
                CheckAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
                CheckAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
                CheckAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
                CheckAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
            }
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> RemoveATokens -> MockGraphTokenFilter.</summary>
        private class MGTFAHAnalyzerAnonymousInnerClassHelper2 : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public MGTFAHAnalyzerAnonymousInnerClassHelper2(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t2 = new RemoveATokens(t);
                TokenStream t3 = new MockGraphTokenFilter(Random(), t2);
                return new TokenStreamComponents(t, t3);
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterRandom()
        {
            for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new AnalyzerAnonymousInnerClassHelper3(this);

                Random random = Random();
                CheckRandomData(random, a, 5, AtLeast(100));
            }
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> MockGraphTokenFilter.</summary>
        private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public AnalyzerAnonymousInnerClassHelper3(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t2 = new MockGraphTokenFilter(Random(), t);
                return new TokenStreamComponents(t, t2);
            }
        }

        // Two MockGraphTokenFilters
        [Test]
        public virtual void TestDoubleMockGraphTokenFilterRandom()
        {
            for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this);

                Random random = Random();
                CheckRandomData(random, a, 5, AtLeast(100));
            }
        }

        // Only checks that the single-argument MockTokenizer constructor can be called.
        [Test]
        public void TestMockTokenizerCtor()
        {
            var sr = new StringReader("Hello");
            var mt = new MockTokenizer(sr);
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> MockGraphTokenFilter -> MockGraphTokenFilter.</summary>
        private class AnalyzerAnonymousInnerClassHelper4 : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public AnalyzerAnonymousInnerClassHelper4(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t1 = new MockGraphTokenFilter(Random(), t);
                TokenStream t2 = new MockGraphTokenFilter(Random(), t1);
                return new TokenStreamComponents(t, t2);
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterBeforeHolesRandom()
        {
            for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new AnalyzerAnonymousInnerClassHelper5(this);

                Random random = Random();
                CheckRandomData(random, a, 5, AtLeast(100));
            }
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> MockGraphTokenFilter -> MockHoleInjectingTokenFilter.</summary>
        private class AnalyzerAnonymousInnerClassHelper5 : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public AnalyzerAnonymousInnerClassHelper5(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t1 = new MockGraphTokenFilter(Random(), t);
                TokenStream t2 = new MockHoleInjectingTokenFilter(Random(), t1);
                return new TokenStreamComponents(t, t2);
            }
        }

        [Test]
        public virtual void TestMockGraphTokenFilterAfterHolesRandom()
        {
            for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }

                // Make new analyzer each time, because MGTF has fixed
                // seed:
                Analyzer a = new AnalyzerAnonymousInnerClassHelper6(this);

                Random random = Random();
                CheckRandomData(random, a, 5, AtLeast(100));
            }
        }

        /// <summary>Analyzer: whitespace MockTokenizer -> MockHoleInjectingTokenFilter -> MockGraphTokenFilter.</summary>
        private class AnalyzerAnonymousInnerClassHelper6 : Analyzer
        {
            private readonly TestGraphTokenizers OuterInstance;

            public AnalyzerAnonymousInnerClassHelper6(TestGraphTokenizers outerInstance)
            {
                this.OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream t1 = new MockHoleInjectingTokenFilter(Random(), t);
                TokenStream t2 = new MockGraphTokenFilter(Random(), t1);
                return new TokenStreamComponents(t, t2);
            }
        }

        /// <summary>Builds a token with offsets 0/0 and the given position increment and length.</summary>
        private static Token Token(string term, int posInc, int posLength)
        {
            Token t = new Token(term, 0, 0);
            t.PositionIncrement = posInc;
            t.PositionLength = posLength;
            return t;
        }

        /// <summary>Builds a token with the given offsets, position increment and position length.</summary>
        private static Token Token(string term, int posInc, int posLength, int startOffset, int endOffset)
        {
            Token t = new Token(term, startOffset, endOffset);
            t.PositionIncrement = posInc;
            t.PositionLength = posLength;
            return t;
        }

        [Test]
        public virtual void TestSingleToken()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton expected = BasicAutomata.MakeString("abc");
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestMultipleHoles()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("a", 1, 1), Token("b", 3, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton expected = Join(S2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, S2a("b"));
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestSynOverMultipleHoles()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("a", 1, 1), Token("x", 0, 3), Token("b", 3, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton a1 = Join(S2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, S2a("b"));
            Automaton a2 = Join(S2a("x"), SEP_A, S2a("b"));
            Automaton expected = BasicOperations.Union(a1, a2);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        // for debugging!
        /*
        private static void toDot(Automaton a) throws IOException {
          final String s = a.toDot();
          Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
          w.write(s);
          w.close();
          System.out.println("TEST: saved to /x/tmp/out.dot");
        }
        */

        private static readonly Automaton SEP_A = BasicAutomata.MakeChar(TokenStreamToAutomaton.POS_SEP);
        private static readonly Automaton HOLE_A = BasicAutomata.MakeChar(TokenStreamToAutomaton.HOLE);

        /// <summary>Concatenates the given strings with <see cref="SEP_A"/> between consecutive strings.</summary>
        private Automaton Join(params string[] strings)
        {
            IList<Automaton> @as = new List<Automaton>();
            foreach (string s in strings)
            {
                @as.Add(BasicAutomata.MakeString(s));
                @as.Add(SEP_A);
            }
            // Drop the separator appended after the last string.
            @as.RemoveAt(@as.Count - 1);
            return BasicOperations.Concatenate(@as);
        }

        /// <summary>Concatenates the given automata in order.</summary>
        private Automaton Join(params Automaton[] @as)
        {
            return BasicOperations.Concatenate(Arrays.AsList(@as));
        }

        /// <summary>Shorthand for <c>BasicAutomata.MakeString(s)</c>.</summary>
        private Automaton S2a(string s)
        {
            return BasicAutomata.MakeString(s);
        }

        [Test]
        public virtual void TestTwoTokens()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1), Token("def", 1, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton expected = Join("abc", "def");

            //toDot(actual);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestHole()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1), Token("def", 2, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);

            Automaton expected = Join(S2a("abc"), SEP_A, HOLE_A, SEP_A, S2a("def"));

            //toDot(actual);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestOverlappedTokensSausage()
        {
            // Two tokens on top of each other (sausage):
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1), Token("xyz", 0, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton a1 = BasicAutomata.MakeString("abc");
            Automaton a2 = BasicAutomata.MakeString("xyz");
            Automaton expected = BasicOperations.Union(a1, a2);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestOverlappedTokensLattice()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1), Token("xyz", 0, 2), Token("def", 1, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton a1 = BasicAutomata.MakeString("xyz");
            Automaton a2 = Join("abc", "def");

            Automaton expected = BasicOperations.Union(a1, a2);
            //toDot(actual);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestSynOverHole()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("a", 1, 1), Token("X", 0, 2), Token("b", 2, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton a1 = BasicOperations.Union(Join(S2a("a"), SEP_A, HOLE_A), BasicAutomata.MakeString("X"));
            Automaton expected = BasicOperations.Concatenate(a1, Join(SEP_A, S2a("b")));
            //toDot(actual);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestSynOverHole2()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("xyz", 1, 1), Token("abc", 0, 3), Token("def", 2, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton expected = BasicOperations.Union(Join(S2a("xyz"), SEP_A, HOLE_A, SEP_A, S2a("def")), BasicAutomata.MakeString("abc"));
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestOverlappedTokensLattice2()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1), Token("xyz", 0, 3), Token("def", 1, 1), Token("ghi", 1, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton a1 = BasicAutomata.MakeString("xyz");
            Automaton a2 = Join("abc", "def", "ghi");
            Automaton expected = BasicOperations.Union(a1, a2);
            //toDot(actual);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        [Test]
        public virtual void TestToDot()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 1, 1, 0, 4) });
            StringWriter w = new StringWriter();
            (new TokenStreamToDot("abcd", ts, (TextWriter)(w))).ToDot();
            // Literal substring check: compare ordinally rather than with the current culture.
            Assert.IsTrue(w.ToString().IndexOf("abc / abcd", StringComparison.Ordinal) != -1);
        }

        [Test]
        public virtual void TestStartsWithHole()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("abc", 2, 1) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton expected = Join(HOLE_A, SEP_A, S2a("abc"));
            //toDot(actual);
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }

        // TODO: testEndsWithHole... but we need posInc to set in TS.end()

        [Test]
        public virtual void TestSynHangingOverEnd()
        {
            TokenStream ts = new CannedTokenStream(new Token[] { Token("a", 1, 1), Token("X", 0, 10) });
            Automaton actual = (new TokenStreamToAutomaton()).ToAutomaton(ts);
            Automaton expected = BasicOperations.Union(BasicAutomata.MakeString("a"), BasicAutomata.MakeString("X"));
            Assert.IsTrue(BasicOperations.SameLanguage(expected, actual));
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Analysis/TestLookaheadTokenFilter.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Tests/Analysis/TestLookaheadTokenFilter.cs b/src/Lucene.Net.Tests/Analysis/TestLookaheadTokenFilter.cs
// new file mode 100644
// index 0000000..ee733bd
// --- /dev/null
// +++ b/src/Lucene.Net.Tests/Analysis/TestLookaheadTokenFilter.cs
// @@ -0,0 +1,129 @@
using Lucene.Net.Attributes;
using Lucene.Net.Randomized.Generators;
using NUnit.Framework;
using System;
using System.IO;

namespace Lucene.Net.Analysis
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Runs random and fixed input through analyzers built on lookahead token filters.
    /// </summary>
    [TestFixture]
    public class TestLookaheadTokenFilter : BaseTokenStreamTestCase
    {
#if !NETSTANDARD
        // LUCENENET: There is no Timeout on NUnit for .NET Core.
        [Timeout(int.MaxValue)]
#endif
        [Test, LongRunningTest, HasTimeout]
        public virtual void TestRandomStrings()
        {
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
            CheckRandomData(Random(), analyzer, 200 * RANDOM_MULTIPLIER, 8192);
        }

        /// <summary>
        /// Analyzer: whitespace MockTokenizer (random lowercase flag) followed by
        /// MockRandomLookaheadTokenFilter, both fed from one Random instance.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper : Analyzer
        {
            private readonly TestLookaheadTokenFilter OuterInstance;

            public AnalyzerAnonymousInnerClassHelper(TestLookaheadTokenFilter outerInstance)
            {
                OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Random rnd = Random();
                // NextBoolean() is drawn before the filter is constructed, as in the original order.
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, rnd.NextBoolean());
                return new TokenStreamComponents(source, new MockRandomLookaheadTokenFilter(rnd, source));
            }
        }

        /// <summary>
        /// Lookahead filter whose IncrementToken only forwards to NextToken().
        /// </summary>
        private class NeverPeeksLookaheadTokenFilter : LookaheadTokenFilter<LookaheadTokenFilter.Position>
        {
            public NeverPeeksLookaheadTokenFilter(TokenStream input)
                : base(input)
            {
            }

            protected internal override LookaheadTokenFilter.Position NewPosition()
            {
                return new LookaheadTokenFilter.Position();
            }

            public sealed override bool IncrementToken()
            {
                return NextToken();
            }
        }

#if !NETSTANDARD
        // LUCENENET: There is no Timeout on NUnit for .NET Core.
        [Timeout(int.MaxValue)]
#endif
        [Test, LongRunningTest, HasTimeout]
        public virtual void TestNeverCallingPeek()
        {
            Analyzer analyzer = new NCPAnalyzerAnonymousInnerClassHelper(this);
            CheckRandomData(Random(), analyzer, 200 * RANDOM_MULTIPLIER, 8192);
        }

        /// <summary>
        /// Analyzer: whitespace MockTokenizer (random lowercase flag) followed by
        /// NeverPeeksLookaheadTokenFilter.
        /// </summary>
        private class NCPAnalyzerAnonymousInnerClassHelper : Analyzer
        {
            private readonly TestLookaheadTokenFilter OuterInstance;

            public NCPAnalyzerAnonymousInnerClassHelper(TestLookaheadTokenFilter outerInstance)
            {
                OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, Random().NextBoolean());
                return new TokenStreamComponents(source, new NeverPeeksLookaheadTokenFilter(source));
            }
        }

        [Test]
        public virtual void TestMissedFirstToken()
        {
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this);

            // Every input token is expected to be followed by its "-huh?" variant.
            string[] expectedTokens = new string[]
            {
                "Only", "Only-huh?",
                "he", "he-huh?",
                "who", "who-huh?",
                "is", "is-huh?",
                "running", "running-huh?",
                "knows", "knows-huh?",
                ".", ".-huh?"
            };

            AssertAnalyzesTo(analyzer, "Only he who is running knows .", expectedTokens);
        }

        /// <summary>
        /// Analyzer: whitespace MockTokenizer followed by TrivialLookaheadFilter.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
        {
            private readonly TestLookaheadTokenFilter OuterInstance;

            public AnalyzerAnonymousInnerClassHelper2(TestLookaheadTokenFilter outerInstance)
            {
                OuterInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(source, new TrivialLookaheadFilter(source));
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Analysis/TestMockAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Analysis/TestMockAnalyzer.cs b/src/Lucene.Net.Tests/Analysis/TestMockAnalyzer.cs new file mode 100644 index 0000000..cd8f315 --- /dev/null +++ b/src/Lucene.Net.Tests/Analysis/TestMockAnalyzer.cs @@ -0,0 +1,420 @@ +using System; +using Lucene.Net.Attributes; +using Lucene.Net.Documents; +using Lucene.Net.Index; + +namespace Lucene.Net.Analysis +{ + using Lucene.Net.Randomized.Generators; + using Lucene.Net.Support; + using NUnit.Framework; + using System.IO; + using AtomicReader = Lucene.Net.Index.AtomicReader; + using Automaton = Lucene.Net.Util.Automaton.Automaton; + using AutomatonTestUtil = Lucene.Net.Util.Automaton.AutomatonTestUtil; + using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata; + using BasicOperations = Lucene.Net.Util.Automaton.BasicOperations; + using BytesRef = Lucene.Net.Util.BytesRef; + using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton; + using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum; + + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + using Document = Documents.Document; + using Field = Field; + using Fields = Lucene.Net.Index.Fields; + using FieldType = FieldType; + using IOUtils = Lucene.Net.Util.IOUtils; + using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter; + using RegExp = Lucene.Net.Util.Automaton.RegExp; + using Terms = Lucene.Net.Index.Terms; + using TermsEnum = Lucene.Net.Index.TermsEnum; + using TestUtil = Lucene.Net.Util.TestUtil; + + [TestFixture] + public class TestMockAnalyzer : BaseTokenStreamTestCase + { + /// <summary> + /// Test a configuration that behaves a lot like WhitespaceAnalyzer </summary> + [Test] + public virtual void TestWhitespace() + { + Analyzer a = new MockAnalyzer(Random()); + AssertAnalyzesTo(a, "A bc defg hiJklmn opqrstuv wxy z ", new string[] { "a", "bc", "defg", "hijklmn", "opqrstuv", "wxy", "z" }); + AssertAnalyzesTo(a, "aba cadaba shazam", new string[] { "aba", "cadaba", "shazam" }); + AssertAnalyzesTo(a, "break on whitespace", new string[] { "break", "on", "whitespace" }); + } + + /// <summary> + /// Test a configuration that behaves a lot like SimpleAnalyzer </summary> + [Test] + public virtual void TestSimple() + { + Analyzer a = new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true); + AssertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ", new 
/// <summary>
/// Tokenizes with the regular expression <c>".."</c>, so every term is a pair of
/// characters, and checks terms, offsets and the final offset. </summary>
[Test]
public virtual void TestTwoChars()
{
    var pairPattern = new CharacterRunAutomaton(new RegExp("..").ToAutomaton());
    Analyzer analyzer = new MockAnalyzer(Random(), pairPattern, false);

    // Even-length input: three complete pairs.
    AssertAnalyzesTo(
        analyzer,
        "foobar",
        new[] { "fo", "ob", "ar" },
        new[] { 0, 2, 4 },
        new[] { 2, 4, 6 });

    // make sure when last term is a "partial" match that End() is correct:
    // the trailing "a" yields no term, but the final offset is still 5.
    var partialInput = analyzer.TokenStream("bogus", new StringReader("fooba"));
    int? expectedFinalOffset = 5;
    AssertTokenStreamContents(
        partialInput,
        new[] { "fo", "ob" },
        new[] { 0, 2 },
        new[] { 2, 4 },
        new[] { 1, 1 },
        expectedFinalOffset);

    CheckRandomData(Random(), analyzer, 100);
}
/// <summary>
/// Fully consumes a token stream over the one-character input "t", closes it, and then
/// verifies that the same analyzer instance still analyzes that input to the single term "t". </summary>
[Test]
public virtual void TestLUCENE_3042()
{
    const string input = "t";

    Analyzer mockAnalyzer = new MockAnalyzer(Random());
    Exception failure = null;
    TokenStream tokens = mockAnalyzer.TokenStream("dummy", new StringReader(input));
    try
    {
        tokens.Reset();

        // Drain the stream; the tokens themselves are not inspected here.
        bool hasMore;
        do
        {
            hasMore = tokens.IncrementToken();
        } while (hasMore);

        tokens.End();
    }
    catch (Exception caught)
    {
        // Remember the failure so the close helper below receives it.
        failure = caught;
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(failure, tokens);
    }

    AssertAnalyzesTo(mockAnalyzer, input, new[] { "t" });
}
/// <summary>
/// LUCENE-5324: checks that an analyzer wrapper can override the position increment gap
/// and the offset gap of the analyzer it wraps. A document with two values of the same
/// field is indexed with term vectors, and the second occurrence is expected at position
/// <c>1 + positionGap</c> with end offset <c>1 + endOffset + offsetGap</c>.
/// </summary>
[Test]
public virtual void TestChangeGaps()
{
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    int positionGap = Random().Next(1000);
    int offsetGap = Random().Next(1000);
    Analyzer @delegate = new MockAnalyzer(Random());
    Analyzer a = new AnalyzerWrapperAnonymousInnerClassHelper2(this, @delegate.Strategy, positionGap, offsetGap, @delegate);

    RandomIndexWriter writer = new RandomIndexWriter(Random(), NewDirectory(), Similarity, TimeZone);
    try
    {
        Document doc = new Document();
        FieldType ft = new FieldType();
        ft.IsIndexed = true;
        ft.IndexOptions = IndexOptions.DOCS_ONLY;
        ft.IsTokenized = true;
        ft.StoreTermVectors = true;
        ft.StoreTermVectorPositions = true;
        ft.StoreTermVectorOffsets = true;

        // Two values for the same field, so the gaps apply between them.
        doc.Add(new Field("f", "a", ft));
        doc.Add(new Field("f", "a", ft));
        writer.AddDocument(doc, a);

        AtomicReader reader = GetOnlySegmentReader(writer.Reader);
        try
        {
            Fields fields = reader.GetTermVectors(0);
            Terms terms = fields.GetTerms("f");
            TermsEnum te = terms.GetIterator(null);
            Assert.AreEqual(new BytesRef("a"), te.Next());

            DocsAndPositionsEnum dpe = te.DocsAndPositions(null, null);
            Assert.AreEqual(0, dpe.NextDoc());
            Assert.AreEqual(2, dpe.Freq);

            // First occurrence: position 0, start offset 0.
            Assert.AreEqual(0, dpe.NextPosition());
            Assert.AreEqual(0, dpe.StartOffset);
            int endOffset = dpe.EndOffset;

            // Second occurrence: shifted by the configured gaps.
            Assert.AreEqual(1 + positionGap, dpe.NextPosition());
            Assert.AreEqual(1 + endOffset + offsetGap, dpe.EndOffset);

            Assert.IsNull(te.Next());
        }
        finally
        {
            // Release the reader even when an assertion above fails.
            reader.Dispose();
        }
    }
    finally
    {
        // Same order as before (writer, then its directory), but now also on failure.
        try
        {
            writer.Dispose();
        }
        finally
        {
            writer.w.Directory.Dispose();
        }
    }
}
protected override Analyzer GetWrappedAnalyzer(string fieldName) + { + return @delegate; + } + + public override int GetPositionIncrementGap(string fieldName) + { + return PositionGap; + } + + public override int GetOffsetGap(string fieldName) + { + return OffsetGap; + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/96822396/src/Lucene.Net.Tests/Analysis/TestMockCharFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests/Analysis/TestMockCharFilter.cs b/src/Lucene.Net.Tests/Analysis/TestMockCharFilter.cs new file mode 100644 index 0000000..75fc8cd --- /dev/null +++ b/src/Lucene.Net.Tests/Analysis/TestMockCharFilter.cs @@ -0,0 +1,59 @@ +using NUnit.Framework; +using System.IO; + +namespace Lucene.Net.Analysis +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/// <summary>
/// Runs a whitespace <c>MockTokenizer</c> behind a <c>MockCharFilter</c> and checks that
/// each input comes back as one term whose offsets span the whole input.
/// </summary>
[TestFixture]
public class TestMockCharFilter : BaseTokenStreamTestCase
{
    [Test]
    public virtual void Test()
    {
        Analyzer filtered = new CharFilteringAnalyzer(this);

        // Every input is a single whitespace-free token: expect the term to equal the
        // input, starting at offset 0 and ending at the input's length (3, 5 and 10).
        foreach (string input in new[] { "aab", "aabaa", "aabcdefgaa" })
        {
            AssertAnalyzesTo(filtered, input, new[] { input }, new[] { 0 }, new[] { input.Length });
        }
    }

    /// <summary>
    /// Analyzer that wraps every reader in <c>new MockCharFilter(reader, 7)</c> and
    /// tokenizes with a non-lowercasing whitespace <c>MockTokenizer</c>.
    /// </summary>
    private class CharFilteringAnalyzer : Analyzer
    {
        // Kept from the original port; not read by this class.
        private readonly TestMockCharFilter outerInstance;

        public CharFilteringAnalyzer(TestMockCharFilter outerInstance)
        {
            this.outerInstance = outerInstance;
        }

        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            // The tokenizer is both the source and the end of the chain.
            return new TokenStreamComponents(source, source);
        }

        protected internal override TextReader InitReader(string fieldName, TextReader reader)
        {
            TextReader wrapped = new MockCharFilter(reader, 7);
            return wrapped;
        }
    }
}
/// <summary>
/// Tests for <c>NumericTokenStream</c>: the sequence of prefix-coded tokens produced for a
/// 64-bit and a 32-bit value, the failure when no value has been set, and the rejection
/// of <c>ICharTermAttribute</c>-based attributes.
/// </summary>
[TestFixture]
public class TestNumericTokenStream : BaseTokenStreamTestCase
{
    internal const long Lvalue = 4573245871874382L;
    internal const int Ivalue = 123456;

    /// <summary>
    /// Walks the stream for <see cref="Lvalue"/> and checks shift, encoded term, raw value
    /// and token type at every precision step.
    /// </summary>
    [NUnit.Framework.Test]
    public virtual void TestLongStream()
    {
        using (NumericTokenStream stream = (new NumericTokenStream()).SetInt64Value(Lvalue))
        {
            // use GetAttribute to test if attributes really exist, if not an exception will be thrown
            ITermToBytesRefAttribute bytesAtt = stream.GetAttribute<ITermToBytesRefAttribute>();
            ITypeAttribute typeAtt = stream.GetAttribute<ITypeAttribute>();
            NumericTokenStream.INumericTermAttribute numericAtt = stream.GetAttribute<NumericTokenStream.INumericTermAttribute>();
            BytesRef bytes = bytesAtt.BytesRef;
            stream.Reset();
            Assert.AreEqual(64, numericAtt.ValueSize);
            for (int shift = 0; shift < 64; shift += NumericUtils.PRECISION_STEP_DEFAULT)
            {
                Assert.IsTrue(stream.IncrementToken(), "New token is available");
                Assert.AreEqual(shift, numericAtt.Shift, "Shift value wrong");
                bytesAtt.FillBytesRef();
                // Expected value: the original with its lowest `shift` bits cleared.
                Assert.AreEqual(Lvalue & ~((1L << shift) - 1L), NumericUtils.PrefixCodedToInt64(bytes), "Term is incorrectly encoded");
                Assert.AreEqual(Lvalue & ~((1L << shift) - 1L), numericAtt.RawValue, "Term raw value is incorrectly encoded");
                Assert.AreEqual((shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.Type, "Type incorrect");
            }
            Assert.IsFalse(stream.IncrementToken(), "More tokens available");
            stream.End();
        }
    }

    /// <summary>
    /// Same walk as <see cref="TestLongStream"/>, for the 32-bit value <see cref="Ivalue"/>.
    /// </summary>
    [NUnit.Framework.Test]
    public virtual void TestIntStream()
    {
        // `using` (as in TestLongStream) so the stream is disposed even when an assertion fails.
        using (NumericTokenStream stream = (new NumericTokenStream()).SetInt32Value(Ivalue))
        {
            // use GetAttribute to test if attributes really exist, if not an exception will be thrown
            ITermToBytesRefAttribute bytesAtt = stream.GetAttribute<ITermToBytesRefAttribute>();
            ITypeAttribute typeAtt = stream.GetAttribute<ITypeAttribute>();
            NumericTokenStream.INumericTermAttribute numericAtt = stream.GetAttribute<NumericTokenStream.INumericTermAttribute>();
            BytesRef bytes = bytesAtt.BytesRef;
            stream.Reset();
            Assert.AreEqual(32, numericAtt.ValueSize);
            for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT)
            {
                Assert.IsTrue(stream.IncrementToken(), "New token is available");
                Assert.AreEqual(shift, numericAtt.Shift, "Shift value wrong");
                bytesAtt.FillBytesRef();
                // Expected value: the original with its lowest `shift` bits cleared.
                Assert.AreEqual(Ivalue & ~((1 << shift) - 1), NumericUtils.PrefixCodedToInt32(bytes), "Term is incorrectly encoded");
                Assert.AreEqual(((long)Ivalue) & ~((1L << shift) - 1L), numericAtt.RawValue, "Term raw value is incorrectly encoded");
                Assert.AreEqual((shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.Type, "Type incorrect");
            }
            Assert.IsFalse(stream.IncrementToken(), "More tokens available");
            stream.End();
        }
    }

    /// <summary>
    /// A stream on which no value was set must throw from both <c>Reset()</c> and
    /// <c>IncrementToken()</c>.
    /// </summary>
    [NUnit.Framework.Test]
    public virtual void TestNotInitialized()
    {
        using (NumericTokenStream stream = new NumericTokenStream())
        {
            // Assert.Catch passes when any exception is thrown. The previous
            // try { ...; Assert.Fail(...); } catch (Exception) { } form also caught the
            // exception raised by Assert.Fail itself, hiding a missing failure.
            Assert.Catch(() => stream.Reset(), "reset() should not succeed.");
            Assert.Catch(() => stream.IncrementToken(), "IncrementToken() should not succeed.");
        }
    }

    public interface ITestAttribute : ICharTermAttribute
    {
    }

    public class TestAttribute : CharTermAttribute, ITestAttribute
    {
    }

    /// <summary>
    /// Adding <c>ICharTermAttribute</c>, or an interface derived from it, must be rejected
    /// with an <see cref="System.ArgumentException"/> whose message starts with
    /// "NumericTokenStream does not support".
    /// </summary>
    [NUnit.Framework.Test]
    public virtual void TestCTA()
    {
        using (NumericTokenStream stream = new NumericTokenStream())
        {
            try
            {
                stream.AddAttribute<ICharTermAttribute>();
                Assert.Fail("Succeeded to add CharTermAttribute.");
            }
            catch (System.ArgumentException iae)
            {
                // Ordinal comparison: the prefix is a fixed literal, not culture-dependent text.
                Assert.IsTrue(iae.Message.StartsWith("NumericTokenStream does not support", StringComparison.Ordinal));
            }
            try
            {
                stream.AddAttribute<ITestAttribute>();
                Assert.Fail("Succeeded to add TestAttribute.");
            }
            catch (System.ArgumentException iae)
            {
                Assert.IsTrue(iae.Message.StartsWith("NumericTokenStream does not support", StringComparison.Ordinal));
            }
        }
    }
}
/// <summary>
/// Trivial position class: derives from <c>LookaheadTokenFilter.Position</c> and adds a
/// single string property, with no behavior of its own.
/// </summary>
public class TestPosition : LookaheadTokenFilter.Position
{
    /// <summary>
    /// String value attached to this position. Nothing in this class reads or validates it.
    /// </summary>
    public virtual string Fact { get; set; }
}
/// <summary>
/// Exercises <c>ReusableStringReader</c>: reads before any value is set, buffered and
/// single-character reads over "foobar", and reuse of the same instance after
/// <c>Dispose()</c> followed by a new <c>SetValue</c>.
/// </summary>
[TestFixture]
public class TestReusableStringReader : LuceneTestCase
{
    [Test]
    public virtual void Test()
    {
        const string text = "foobar";
        const int endOfInput = -1;
        var scratch = new char[4];

        using (var first = new ReusableStringReader())
        {
            // No value set yet: every read form reports end of input.
            Assert.AreEqual(endOfInput, first.Read());
            Assert.AreEqual(endOfInput, first.Read(new char[1], 0, 1));
            Assert.AreEqual(endOfInput, first.Read(new char[2], 1, 1));
            //Assert.AreEqual(-1, reader.Read(CharBuffer.wrap(new char[2])));

            // Six characters through a four-character buffer: 4, then the remaining 2.
            first.SetValue(text);
            Assert.AreEqual(4, first.Read(scratch, 0, 4));
            Assert.AreEqual("foob", new string(scratch));
            Assert.AreEqual(2, first.Read(scratch, 0, 2));
            Assert.AreEqual("ar", new string(scratch, 0, 2));
            // Exhausted: even a zero-length read reports end of input.
            Assert.AreEqual(endOfInput, first.Read(scratch, 2, 0));
        }

        using (var second = new ReusableStringReader())
        {
            second.SetValue(text);
            // With data still available, a zero-length read returns 0 rather than -1.
            Assert.AreEqual(0, second.Read(scratch, 1, 0));
            Assert.AreEqual(3, second.Read(scratch, 1, 3));
            Assert.AreEqual("foo", new string(scratch, 1, 3));
            Assert.AreEqual(2, second.Read(scratch, 2, 2));
            Assert.AreEqual("ba", new string(scratch, 2, 2));
            Assert.AreEqual('r', (char)second.Read());
            Assert.AreEqual(endOfInput, second.Read(scratch, 2, 0));
            second.Dispose();

            // The instance is reused after Dispose(): set a value again and drain it
            // one character at a time.
            second.SetValue(text);
            var collected = new StringBuilder();
            for (int next = second.Read(); next != -1; next = second.Read())
            {
                collected.Append((char)next);
            }
            Assert.AreEqual("foobar", collected.ToString());
        }
    }
}
--git a/src/Lucene.Net.Tests/Analysis/TestToken.cs b/src/Lucene.Net.Tests/Analysis/TestToken.cs new file mode 100644 index 0000000..5e3fa66 --- /dev/null +++ b/src/Lucene.Net.Tests/Analysis/TestToken.cs @@ -0,0 +1,305 @@ +using System.Collections.Generic; +using System.Text; + +namespace Lucene.Net.Analysis +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/// <summary>
/// Tests for <c>Token</c>: constructors, term-buffer resizing and growth, string/char[]
/// interplay, equality, cloning, copying, the token attribute factory and attribute
/// reflection.
/// </summary>
[TestFixture]
public class TestToken : LuceneTestCase
{
    /// <summary>
    /// Checks the default, (start, end), (start, end, flags) and (start, end, type)
    /// constructors: offsets, type and flags after the term buffer is filled with "hello".
    /// </summary>
    [Test]
    public virtual void TestCtor()
    {
        Token t = new Token();
        char[] content = "hello".ToCharArray();
        t.CopyBuffer(content, 0, content.Length);
        // CopyBuffer must copy, not adopt, the caller's array.
        Assert.AreNotSame(t.Buffer, content);
        Assert.AreEqual(0, t.StartOffset);
        Assert.AreEqual(0, t.EndOffset);
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual("word", t.Type);
        Assert.AreEqual(0, t.Flags);

        t = new Token(6, 22);
        t.CopyBuffer(content, 0, content.Length);
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual(6, t.StartOffset);
        Assert.AreEqual(22, t.EndOffset);
        Assert.AreEqual("word", t.Type);
        Assert.AreEqual(0, t.Flags);

        t = new Token(6, 22, 7);
        t.CopyBuffer(content, 0, content.Length);
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual(6, t.StartOffset);
        Assert.AreEqual(22, t.EndOffset);
        Assert.AreEqual("word", t.Type);
        Assert.AreEqual(7, t.Flags);

        t = new Token(6, 22, "junk");
        t.CopyBuffer(content, 0, content.Length);
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual("hello", t.ToString());
        Assert.AreEqual(6, t.StartOffset);
        Assert.AreEqual(22, t.EndOffset);
        Assert.AreEqual("junk", t.Type);
        Assert.AreEqual(0, t.Flags);
    }

    /// <summary>
    /// Resizing the buffer to any size from 0 to 1999 must give at least that capacity and
    /// must leave the term text unchanged.
    /// </summary>
    [Test]
    public virtual void TestResize()
    {
        Token t = new Token();
        char[] content = "hello".ToCharArray();
        t.CopyBuffer(content, 0, content.Length);
        for (int i = 0; i < 2000; i++)
        {
            t.ResizeBuffer(i);
            Assert.IsTrue(i <= t.Buffer.Length);
            Assert.AreEqual("hello", t.ToString());
        }
    }

    /// <summary>
    /// Grows the term by repeated doubling (to 2 * 2^19 = 1048576 characters) and by one
    /// character at a time (to 20000 characters), checking length and content at each step.
    /// </summary>
    [Test]
    public virtual void TestGrow()
    {
        // Doubling growth through CopyBuffer(char[]).
        Token t = new Token();
        StringBuilder buf = new StringBuilder("ab");
        for (int i = 0; i < 20; i++)
        {
            char[] content = buf.ToString().ToCharArray();
            t.CopyBuffer(content, 0, content.Length);
            Assert.AreEqual(buf.Length, t.Length);
            Assert.AreEqual(buf.ToString(), t.ToString());
            buf.Append(buf.ToString());
        }
        Assert.AreEqual(1048576, t.Length);

        // now as a string, second variant
        t = new Token();
        buf = new StringBuilder("ab");
        for (int i = 0; i < 20; i++)
        {
            t.SetEmpty().Append(buf);
            string content = buf.ToString();
            Assert.AreEqual(content.Length, t.Length);
            Assert.AreEqual(content, t.ToString());
            buf.Append(content);
        }
        Assert.AreEqual(1048576, t.Length);

        // Test for slow growth to a long term
        t = new Token();
        buf = new StringBuilder("a");
        for (int i = 0; i < 20000; i++)
        {
            t.SetEmpty().Append(buf);
            string content = buf.ToString();
            Assert.AreEqual(content.Length, t.Length);
            Assert.AreEqual(content, t.ToString());
            buf.Append("a");
        }
        Assert.AreEqual(20000, t.Length);

        // Test for slow growth to a long term
        // (this stanza repeats the previous one statement for statement)
        t = new Token();
        buf = new StringBuilder("a");
        for (int i = 0; i < 20000; i++)
        {
            t.SetEmpty().Append(buf);
            string content = buf.ToString();
            Assert.AreEqual(content.Length, t.Length);
            Assert.AreEqual(content, t.ToString());
            buf.Append("a");
        }
        Assert.AreEqual(20000, t.Length);
    }

    /// <summary>
    /// <c>ToString()</c> returns the current term text, whether it was set from a char[]
    /// or appended as a string.
    /// </summary>
    [Test]
    public virtual void TestToString()
    {
        char[] b = new char[] { 'a', 'l', 'o', 'h', 'a' };
        Token t = new Token("", 0, 5);
        t.CopyBuffer(b, 0, 5);
        Assert.AreEqual("aloha", t.ToString());

        t.SetEmpty().Append("hi there");
        Assert.AreEqual("hi there", t.ToString());
    }

    /// <summary>
    /// Tokens with the same term text are equal; tokens with different term text are not.
    /// </summary>
    [Test]
    public virtual void TestTermBufferEquals()
    {
        Token t1a = new Token();
        char[] content1a = "hello".ToCharArray();
        t1a.CopyBuffer(content1a, 0, 5);
        Token t1b = new Token();
        char[] content1b = "hello".ToCharArray();
        t1b.CopyBuffer(content1b, 0, 5);
        Token t2 = new Token();
        char[] content2 = "hello2".ToCharArray();
        t2.CopyBuffer(content2, 0, 6);
        Assert.IsTrue(t1a.Equals(t1b));
        Assert.IsFalse(t1a.Equals(t2));
        Assert.IsFalse(t2.Equals(t1b));
    }

    /// <summary>
    /// Mixes the string-based and char[]-based ways of setting the term and checks that a
    /// write through <c>Buffer</c> is visible in <c>ToString()</c>.
    /// </summary>
    [Test]
    public virtual void TestMixedStringArray()
    {
        // Arguments are (expected, actual) so NUnit failure messages read correctly.
        Token t = new Token("hello", 0, 5);
        Assert.AreEqual(5, t.Length);
        Assert.AreEqual("hello", t.ToString());
        t.SetEmpty().Append("hello2");
        Assert.AreEqual(6, t.Length);
        Assert.AreEqual("hello2", t.ToString());
        t.CopyBuffer("hello3".ToCharArray(), 0, 6);
        Assert.AreEqual("hello3", t.ToString());

        // Buffer exposes the live backing array: changing it changes the term.
        char[] buffer = t.Buffer;
        buffer[1] = 'o';
        Assert.AreEqual("hollo3", t.ToString());
    }

    /// <summary>
    /// A clone equals the original but has its own term buffer and its own payload instance.
    /// </summary>
    [Test]
    public virtual void TestClone()
    {
        Token t = new Token(0, 5);
        char[] content = "hello".ToCharArray();
        t.CopyBuffer(content, 0, 5);
        char[] buf = t.Buffer;
        Token copy = AssertCloneIsEqual(t);
        Assert.AreEqual(t.ToString(), copy.ToString());
        Assert.AreNotSame(buf, copy.Buffer);

        BytesRef pl = new BytesRef(new byte[] { 1, 2, 3, 4 });
        t.Payload = pl;
        copy = AssertCloneIsEqual(t);
        Assert.AreEqual(pl, copy.Payload);
        Assert.AreNotSame(pl, copy.Payload);
    }

    /// <summary>
    /// <c>CopyTo</c> into a fresh instance yields an equal token with its own term buffer
    /// and its own payload instance, including for an empty token.
    /// </summary>
    [Test]
    public virtual void TestCopyTo()
    {
        Token t = new Token();
        Token copy = AssertCopyIsEqual(t);
        Assert.AreEqual("", t.ToString());
        Assert.AreEqual("", copy.ToString());

        t = new Token(0, 5);
        char[] content = "hello".ToCharArray();
        t.CopyBuffer(content, 0, 5);
        char[] buf = t.Buffer;
        copy = AssertCopyIsEqual(t);
        Assert.AreEqual(t.ToString(), copy.ToString());
        Assert.AreNotSame(buf, copy.Buffer);

        BytesRef pl = new BytesRef(new byte[] { 1, 2, 3, 4 });
        t.Payload = pl;
        copy = AssertCopyIsEqual(t);
        Assert.AreEqual(pl, copy.Payload);
        Assert.AreNotSame(pl, copy.Payload);
    }

    /// <summary>
    /// Marker attribute interface used by <see cref="TestTokenAttributeFactory"/>.
    /// </summary>
    public interface ISenselessAttribute : Lucene.Net.Util.IAttribute
    {
    }

    /// <summary>
    /// Stateless implementation of <see cref="ISenselessAttribute"/>: copy and clear do
    /// nothing, and all instances are equal with hash code 0.
    /// </summary>
    public sealed class SenselessAttribute : Attribute, ISenselessAttribute
    {
        public override void CopyTo(IAttribute target)
        {
        }

        public override void Clear()
        {
        }

        public override bool Equals(object o)
        {
            return (o is SenselessAttribute);
        }

        public override int GetHashCode()
        {
            return 0;
        }
    }

    /// <summary>
    /// With <c>Token.TOKEN_ATTRIBUTE_FACTORY</c>, the attribute interfaces asserted below
    /// are implemented by <c>Token</c>, while <see cref="ISenselessAttribute"/> gets its
    /// own <see cref="SenselessAttribute"/> implementation.
    /// </summary>
    [Test]
    public virtual void TestTokenAttributeFactory()
    {
        TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, new System.IO.StringReader("foo bar"), MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);

        Assert.IsTrue(ts.AddAttribute<ISenselessAttribute>() is SenselessAttribute, "SenselessAttribute is not implemented by SenselessAttributeImpl");

        Assert.IsTrue(ts.AddAttribute<ICharTermAttribute>() is Token, "CharTermAttribute is not implemented by Token");
        Assert.IsTrue(ts.AddAttribute<IOffsetAttribute>() is Token, "OffsetAttribute is not implemented by Token");
        Assert.IsTrue(ts.AddAttribute<IFlagsAttribute>() is Token, "FlagsAttribute is not implemented by Token");
        Assert.IsTrue(ts.AddAttribute<IPayloadAttribute>() is Token, "PayloadAttribute is not implemented by Token");
        Assert.IsTrue(ts.AddAttribute<IPositionIncrementAttribute>() is Token, "PositionIncrementAttribute is not implemented by Token");
        Assert.IsTrue(ts.AddAttribute<ITypeAttribute>() is Token, "TypeAttribute is not implemented by Token");
    }

    /// <summary>
    /// Checks the key/value pairs that attribute reflection reports for a token built from
    /// ("foobar", 6, 22, 8).
    /// </summary>
    [Test]
    public virtual void TestAttributeReflection()
    {
        Token t = new Token("foobar", 6, 22, 8);
        TestUtil.AssertAttributeReflection(t, new Dictionary<string, object>()
        {
            { typeof(ICharTermAttribute).Name + "#term", "foobar" },
            { typeof(ITermToBytesRefAttribute).Name + "#bytes", new BytesRef("foobar") },
            { typeof(IOffsetAttribute).Name + "#startOffset", 6 },
            { typeof(IOffsetAttribute).Name + "#endOffset", 22 },
            { typeof(IPositionIncrementAttribute).Name + "#positionIncrement", 1 },
            { typeof(IPayloadAttribute).Name + "#payload", null },
            { typeof(ITypeAttribute).Name + "#type", TypeAttribute_Fields.DEFAULT_TYPE },
            { typeof(IFlagsAttribute).Name + "#flags", 8 }
        });
    }

    /// <summary>
    /// Clones <paramref name="att"/> and asserts that the clone is equal to it and has the
    /// same hash code.
    /// </summary>
    /// <param name="att">Attribute to clone.</param>
    /// <returns>The clone.</returns>
    public static T AssertCloneIsEqual<T>(T att) where T : Attribute
    {
        T clone = (T)att.Clone();
        Assert.AreEqual(att, clone, "Clone must be equal");
        Assert.AreEqual(att.GetHashCode(), clone.GetHashCode(), "Clone's hashcode must be equal");
        return clone;
    }

    /// <summary>
    /// Creates a new instance of the runtime type of <paramref name="att"/>, copies
    /// <paramref name="att"/> into it with <c>CopyTo</c>, and asserts that the copy is
    /// equal to the source and has the same hash code.
    /// </summary>
    /// <param name="att">Attribute to copy.</param>
    /// <returns>The populated copy.</returns>
    public static T AssertCopyIsEqual<T>(T att) where T : Attribute
    {
        T copy = (T)System.Activator.CreateInstance(att.GetType());
        att.CopyTo(copy);
        Assert.AreEqual(att, copy, "Copied instance must be equal");
        Assert.AreEqual(att.GetHashCode(), copy.GetHashCode(), "Copied instance's hashcode must be equal");
        return copy;
    }
}
