Ported tests/MockTokenizer
/*
 * Commit notification (preserved verbatim from the original message).
 *
 * Project:   http://git-wip-us.apache.org/repos/asf/lucenenet/repo
 * Commit:    http://git-wip-us.apache.org/repos/asf/lucenenet/commit/6e9d73f4
 * Tree:      http://git-wip-us.apache.org/repos/asf/lucenenet/tree/6e9d73f4
 * Diff:      http://git-wip-us.apache.org/repos/asf/lucenenet/diff/6e9d73f4
 * Branch:    refs/heads/branch_4x
 * Commit:    6e9d73f4ac8bcbc1b0ae23dc4c32e5ca249c5be8
 * Parents:   5ecbe92
 * Author:    synhershko <[email protected]>
 * Authored:  Tue Apr 8 01:07:44 2014 +0300
 * Committer: synhershko <[email protected]>
 * Committed: Tue Apr 8 01:07:44 2014 +0300
 *
 * ----------------------------------------------------------------------
 *  test/test-framework/Analysis/MockTokenizer.cs   | 285 +++++++++++++++++++
 *  .../Lucene.Net.TestFramework.csproj             |   3 +-
 *  2 files changed, 287 insertions(+), 1 deletion(-)
 * ----------------------------------------------------------------------
 *
 * http://git-wip-us.apache.org/repos/asf/lucenenet/blob/6e9d73f4/test/test-framework/Analysis/MockTokenizer.cs
 * ----------------------------------------------------------------------
 * diff --git a/test/test-framework/Analysis/MockTokenizer.cs b/test/test-framework/Analysis/MockTokenizer.cs
 * new file mode 100644
 * index 0000000..0cd2942
 * --- /dev/null
 * +++ b/test/test-framework/Analysis/MockTokenizer.cs
 * @@ -0,0 +1,285 @@
 *
 * The file content below is the reviewed version of that new file.
 */
using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Randomized;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Lucene.Net.Util.Automaton;

namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Tokenizer for testing.
    /// <para>
    /// This tokenizer is a replacement for the <see cref="WHITESPACE"/>, <see cref="SIMPLE"/> and
    /// <see cref="KEYWORD"/> tokenizers. If you are writing a component such as a TokenFilter, it is
    /// a great idea to test it wrapping this tokenizer instead for extra checks. This tokenizer has
    /// the following behavior:
    /// </para>
    /// <list type="bullet">
    /// <item><description>An internal state machine tracks the consumer workflow
    /// (set reader, reset, increment, end, close). The corresponding assertions are still
    /// commented out in this port, so <see cref="setEnableChecks(bool)"/> currently only
    /// records the flag.</description></item>
    /// <item><description>For convenience, optionally lowercases terms that it outputs.</description></item>
    /// </list>
    /// </summary>
    public class MockTokenizer : Tokenizer
    {
        /// <summary>Acts similar to WhitespaceTokenizer.</summary>
        public static readonly CharacterRunAutomaton WHITESPACE =
            new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").ToAutomaton());

        /// <summary>
        /// Acts similar to KeywordTokenizer.
        /// TODO: Keyword returns an "empty" token for an empty reader...
        /// </summary>
        public static readonly CharacterRunAutomaton KEYWORD =
            new CharacterRunAutomaton(new RegExp(".*").ToAutomaton());

        /// <summary>Acts like LetterTokenizer.</summary>
        // The ugly regex below is an incomplete Unicode 5.2 [:Letter:] class. The non-ASCII
        // characters are written as \u escapes so the literal cannot be damaged by a file
        // re-encoding: U+00AA, U+00B5, U+00BA, U+00C0-U+00D6, U+00D8-U+00F6, U+00F8-U+02C1,
        // U+4E00-U+9FCC.
        public static readonly CharacterRunAutomaton SIMPLE =
            new CharacterRunAutomaton(new RegExp(
                "[A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02C1\u4E00-\u9FCC]+").ToAutomaton());

        /// <summary>Token length limit used by the constructors that do not take one.</summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue;

        private readonly CharacterRunAutomaton runAutomaton;
        private readonly bool lowerCase;
        private readonly int maxTokenLength;

        // Current state of runAutomaton while scanning the input.
        private int state;

        private readonly CharTermAttribute termAtt;
        private readonly OffsetAttribute offsetAtt;

        // Number of UTF-16 chars read from the input since the last Reset().
        private int off = 0;

        // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
        // currently, we can only check that the lifecycle is correct if someone is reusing,
        // but not for "one-offs".
        private enum State
        {
            SETREADER,       // consumer set a reader input either via ctor or via reset(Reader)
            RESET,           // consumer has called reset()
            INCREMENT,       // consumer is consuming, has called incrementToken() == true
            INCREMENT_FALSE, // consumer has called incrementToken() which returned false
            END,             // consumer has called end() to perform end of stream operations
            CLOSE            // consumer has called close() to release any resources
        }

        private State streamState = State.CLOSE;
        private int lastOffset = 0; // only for asserting
        private bool enableChecks = true;

        // evil: but we don't change the behavior with this random, we only switch up how we read.
        // TODO: seed from RandomizedContext.Current.getRandom().nextLong() once that is ported.
        private readonly Random random = new Random();

        /// <summary>
        /// Creates a tokenizer that emits maximal runs of characters accepted by
        /// <paramref name="runAutomaton"/>.
        /// </summary>
        /// <param name="factory">Attribute factory handed to the base tokenizer.</param>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        /// <param name="runAutomaton">Automaton deciding which characters belong to a token.</param>
        /// <param name="lowerCase">When true, every code point is lowercased before it is appended.</param>
        /// <param name="maxTokenLength">A token is cut once its term reaches this many chars.</param>
        public MockTokenizer(AttributeSource.AttributeFactory factory, System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
            : base(factory, input)
        {
            this.runAutomaton = runAutomaton;
            this.lowerCase = lowerCase;
            this.state = runAutomaton.InitialState;
            this.streamState = State.SETREADER;
            this.maxTokenLength = maxTokenLength;

            termAtt = AddAttribute<CharTermAttribute>();
            offsetAtt = AddAttribute<OffsetAttribute>();
        }

        /// <summary>Same as the full constructor, using <c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c>.</summary>
        public MockTokenizer(System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
            : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength)
        {
        }

        /// <summary>Same as the full constructor, using the default factory and <see cref="DEFAULT_MAX_TOKEN_LENGTH"/>.</summary>
        public MockTokenizer(System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
            : this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
        {
        }

        /// <summary>Calls <c>MockTokenizer(input, WHITESPACE, true)</c>.</summary>
        public MockTokenizer(System.IO.TextReader input)
            : this(input, WHITESPACE, true)
        {
        }

        /// <summary>Same as the full constructor, using <see cref="DEFAULT_MAX_TOKEN_LENGTH"/>.</summary>
        public MockTokenizer(AttributeFactory factory, System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
            : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
        {
        }

        /// <summary>Calls <c>MockTokenizer(factory, input, WHITESPACE, true)</c>.</summary>
        public MockTokenizer(AttributeFactory factory, System.IO.TextReader input)
            : this(factory, input, WHITESPACE, true) // fixed: the factory argument used to be dropped here
        {
        }

        /// <summary>
        /// Advances to the next token: skips characters rejected by the automaton, then collects
        /// accepted code points until a rejected one, end of input, or the length limit is reached.
        /// </summary>
        /// <returns>true if a token was produced; false once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            // assert !enableChecks || (streamState == State.RESET || streamState == State.INCREMENT)
            //   : "incrementToken() called while in wrong state: " + streamState;
            ClearAttributes();
            for (; ; )
            {
                int startOffset = off;
                int cp = readCodePoint();
                if (cp < 0)
                {
                    break;
                }

                if (!isTokenChar(cp))
                {
                    continue;
                }

                int endOffset;
                do
                {
                    char[] chars = Character.ToChars(Normalize(cp));
                    for (int i = 0; i < chars.Length; i++)
                    {
                        termAtt.Append(chars[i]);
                    }
                    endOffset = off;
                    if (termAtt.Length >= maxTokenLength)
                    {
                        break;
                    }
                    cp = readCodePoint();
                } while (cp >= 0 && isTokenChar(cp));

                int correctedStartOffset = CorrectOffset(startOffset);
                int correctedEndOffset = CorrectOffset(endOffset);
                // assert correctedStartOffset >= 0;
                // assert correctedEndOffset >= 0;
                // assert correctedStartOffset >= lastOffset;
                lastOffset = correctedStartOffset;
                // assert correctedEndOffset >= correctedStartOffset;
                offsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
                streamState = State.INCREMENT;
                return true;
            }
            streamState = State.INCREMENT_FALSE;
            return false;
        }

        /// <summary>
        /// Reads one code point from the input, combining a high surrogate with the char that
        /// follows it, and advances <c>off</c> by the number of chars consumed.
        /// </summary>
        /// <returns>The code point, or a negative value at end of input.</returns>
        protected int readCodePoint()
        {
            int ch = ReadChar();
            if (ch < 0)
            {
                return ch;
            }

            //assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
            off++;
            if (Character.IsHighSurrogate((char)ch))
            {
                int ch2 = ReadChar();
                if (ch2 >= 0)
                {
                    off++;
                    //assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
                    return Character.ToCodePoint((char)ch, (char)ch2);
                }
                //assert false : "stream ends with unpaired high surrogate: " + Integer.toHexString(ch);
            }
            return ch;
        }

        /// <summary>
        /// Reads a single char, randomly choosing between the reader's different Read overloads.
        /// </summary>
        /// <returns>The char value, or -1 at end of input.</returns>
        protected int ReadChar()
        {
            switch (random.Next(0, 10))
            {
                case 0:
                    {
                        // read(char[])
                        char[] c = new char[1];
                        int ret = input.Read(c, 0, c.Length);
                        // TextReader.Read(char[], int, int) reports end of stream as 0 (not -1 as
                        // in Java), so "no chars read" must be mapped to -1 explicitly.
                        return ret <= 0 ? -1 : c[0];
                    }
                case 1:
                    {
                        // read(char[], int, int)
                        char[] c = new char[2];
                        int ret = input.Read(c, 1, 1);
                        return ret <= 0 ? -1 : c[1];
                    }
                // case 2: read(CharBuffer) -- not ported, there is no CharBuffer equivalent here.
                default:
                    // read()
                    return input.Read();
            }
        }

        /// <summary>
        /// Steps the automaton with <paramref name="c"/>. On rejection the automaton is rewound to
        /// its initial state so the next character starts a fresh match.
        /// </summary>
        /// <returns>true if the automaton accepted the transition.</returns>
        protected bool isTokenChar(int c)
        {
            state = runAutomaton.Step(state, c);
            if (state < 0)
            {
                state = runAutomaton.InitialState;
                return false;
            }
            return true;
        }

        /// <summary>Lowercases <paramref name="c"/> when this tokenizer was created with lowerCase = true.</summary>
        protected int Normalize(int c)
        {
            return lowerCase ? Character.ToLowerCase(c) : c;
        }

        /// <summary>Rewinds the automaton and the offset counters and marks the stream as reset.</summary>
        public override void Reset()
        {
            base.Reset();
            state = runAutomaton.InitialState;
            lastOffset = off = 0;
            //assert !enableChecks || streamState != State.RESET : "double reset()";
            streamState = State.RESET;
        }

        /// <summary>Releases the base tokenizer's resources and marks the stream as closed.</summary>
        // NOTE(review): this was declared "protected virtual", which hides the base member instead
        // of overriding it, so disposal through a base-typed reference never reached this method.
        // Assumes the base Dispose(bool) is virtual (the original already chained to it) -- confirm.
        protected override void Dispose(bool disposing)
        {
            base.Dispose(disposing);
            // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close()
            // these tests should disable this check, by default we check the normal workflow.
            // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
            //assert !enableChecks || streamState == State.END || streamState == State.CLOSE : "close() called in wrong state: " + streamState;
            streamState = State.CLOSE;
        }

        // Records that a new reader was set. Nothing visible in this file calls it yet.
        bool setReaderTestPoint()
        {
            //assert !enableChecks || streamState == State.CLOSE : "setReader() called in wrong state: " + streamState;
            streamState = State.SETREADER;
            return true;
        }

        /// <summary>Sets the final offset to the corrected number of chars read and marks the stream as ended.</summary>
        public override void End()
        {
            int finalOffset = CorrectOffset(off);
            offsetAtt.SetOffset(finalOffset, finalOffset);
            // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
            // these tests should disable this check (in general you should consume the entire stream)
            //assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
            streamState = State.END;
        }

        /// <summary>
        /// Toggle consumer workflow checking: if your test consumes tokenstreams normally you
        /// should leave this enabled.
        /// </summary>
        public void setEnableChecks(bool enableChecks)
        {
            this.enableChecks = enableChecks;
        }
    }
}

/*
 * Accompanying project-file change from the same commit (preserved from the original message).
 *
 * http://git-wip-us.apache.org/repos/asf/lucenenet/blob/6e9d73f4/test/test-framework/Lucene.Net.TestFramework.csproj
 * ----------------------------------------------------------------------
 * diff --git a/test/test-framework/Lucene.Net.TestFramework.csproj b/test/test-framework/Lucene.Net.TestFramework.csproj
 * index 14d381f..6b31aaa 100644
 * --- a/test/test-framework/Lucene.Net.TestFramework.csproj
 * +++ b/test/test-framework/Lucene.Net.TestFramework.csproj
 * @@ -56,6 +56,7 @@
 *      <Reference Include="System.Xml" />
 *    </ItemGroup>
 *    <ItemGroup>
 * +    <Compile Include="Analysis\MockTokenizer.cs" />
 *      <Compile Include="JavaCompatibility\LuceneTestCase.cs" />
 *      <Compile Include="JavaCompatibility\LuceneTypesHelpers.cs" />
 *      <Compile Include="JavaCompatibility\SystemTypesHelpers.cs" />
 * @@ -85,7 +86,7 @@
 *      </ProjectReference>
 *    </ItemGroup>
 *    <ItemGroup>
 * -    <Folder Include="Analysis\" />
 * +    <Folder Include="Index\" />
 *    </ItemGroup>
 *    <ItemGroup>
 *      <None Include="Lucene.Net.snk" />
 */
