/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

Porter stemmer, ported from the Java version. The original paper is in

    Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
    no. 3, pp 130-137,

See also http://www.tartarus.org/~martin/PorterStemmer/index.html

Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then outside the bounds of b.

Similarly,

Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.

Release 3.

[ This version is derived from Release 3, modified by Brian Goetz to
optimize for fewer object creations. ]
*/
using System;
namespace Lucene.Net.Analysis
{

    /// <summary>
    /// Stemmer, implementing the Porter Stemming Algorithm.
    ///
    /// The stemmer transforms a word into its root form. The input word can be
    /// supplied one character at a time (via <c>Add</c> followed by <c>Stem()</c>),
    /// or all at once through one of the <c>Stem(...)</c> overloads.
    /// Instances keep the word in an internal buffer and mutate it in place.
    /// </summary>
    class PorterStemmer
    {
        // Working buffer holding the word being stemmed.
        private char[] b;

        // i  : number of valid characters in b (one past the end of the word)
        // k0 : index of the first character of the word
        // k  : index of the last character of the word (moves left as suffixes are removed)
        // j  : general offset; Ends() sets it to the last index of the stem preceding the matched suffix
        private int i, j, k, k0;

        // True once the buffer content differs from the input word.
        private bool dirty = false;

        private const int INC = 50; /* unit of size whereby b is increased */
        private const int EXTRA = 1;

        /// <summary>Creates a stemmer with an empty buffer of <c>INC</c> characters.</summary>
        public PorterStemmer()
        {
            b = new char[INC];
            i = 0;
        }

        /// <summary>
        /// Resets the stemmer so it can stem another word. When feeding the stemmer
        /// through <c>Add(char)</c> and <c>Stem()</c>, call this before starting
        /// the next word.
        /// </summary>
        public virtual void Reset()
        {
            i = 0;
            dirty = false;
        }

        /// <summary>
        /// Appends a character to the word being stemmed, growing the buffer by
        /// <c>INC</c> characters when it is full. Call <c>Stem()</c> once all
        /// characters have been added.
        /// </summary>
        /// <param name="ch">The character to append.</param>
        public virtual void Add(char ch)
        {
            if (b.Length <= i + EXTRA)
            {
                var new_b = new char[b.Length + INC];
                Array.Copy(b, 0, new_b, 0, b.Length);
                b = new_b;
            }
            b[i++] = ch;
        }

        /// <summary>
        /// Returns the current buffer content (the stemmed word after a call to
        /// <c>Stem</c>) as a new string. <c>ResultBuffer</c> together with
        /// <c>ResultLength</c> gives the same data without the allocation.
        /// </summary>
        public override System.String ToString()
        {
            return new System.String(b, 0, i);
        }

        /// <summary>Returns the length of the word resulting from the stemming process.</summary>
        public virtual int ResultLength
        {
            get { return i; }
        }

        /// <summary>
        /// Returns a reference to the internal character buffer holding the result
        /// of the stemming process. Only the first <c>ResultLength</c> characters
        /// are meaningful.
        /// </summary>
        public virtual char[] ResultBuffer
        {
            get { return b; }
        }

        /* cons(i) is true <=> b[i] is a consonant. A 'y' counts as a consonant
           at the start of the word or when it follows a vowel. */

        private bool Cons(int i)
        {
            switch (b[i])
            {
                case 'a':
                case 'e':
                case 'i':
                case 'o':
                case 'u':
                    return false;

                case 'y':
                    return (i == k0) ? true : !Cons(i - 1);

                default:
                    return true;
            }
        }

        /* m() measures the number of consonant sequences between k0 and j. if c is
           a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
           presence,

                <c><v>       gives 0
                <c>vc<v>     gives 1
                <c>vcvc<v>   gives 2
                <c>vcvcvc<v> gives 3
                ....
        */

        private int M()
        {
            int n = 0;
            int i = k0;

            // Skip the optional leading consonant sequence.
            while (true)
            {
                if (i > j)
                {
                    return n;
                }
                if (!Cons(i))
                {
                    break;
                }
                i++;
            }
            i++;

            while (true)
            {
                // Skip a vowel sequence.
                while (true)
                {
                    if (i > j)
                    {
                        return n;
                    }
                    if (Cons(i))
                    {
                        break;
                    }
                    i++;
                }
                i++;
                n++; // a complete vowel-consonant pair has been seen

                // Skip the consonant sequence that follows.
                while (true)
                {
                    if (i > j)
                    {
                        return n;
                    }
                    if (!Cons(i))
                    {
                        break;
                    }
                    i++;
                }
                i++;
            }
        }

        /* vowelinstem() is true <=> k0,...j contains a vowel */

        private bool Vowelinstem()
        {
            for (int i = k0; i <= j; i++)
            {
                if (!Cons(i))
                {
                    return true;
                }
            }
            return false;
        }

        /* doublec(j) is true <=> j,(j-1) contain a double consonant. */

        private bool Doublec(int j)
        {
            if (j < k0 + 1)
            {
                return false;
            }
            if (b[j] != b[j - 1])
            {
                return false;
            }
            return Cons(j);
        }

        /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
           and also if the second c is not w,x or y. this is used when trying to
           restore an e at the end of a short word. e.g.

                cav(e), lov(e), hop(e), crim(e), but
                snow, box, tray.

        */

        private bool Cvc(int i)
        {
            if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
            {
                return false;
            }

            int ch = b[i];
            if (ch == 'w' || ch == 'x' || ch == 'y')
            {
                return false;
            }
            return true;
        }

        /* ends(s) is true <=> k0,...k ends with the string s. On success j is
           set to the index of the last character before the suffix; on failure
           j is left untouched. */

        private bool Ends(System.String s)
        {
            int l = s.Length;
            int o = k - l + 1;
            if (o < k0)
            {
                return false;
            }
            for (int i = 0; i < l; i++)
            {
                if (b[o + i] != s[i])
                {
                    return false;
                }
            }
            j = k - l;
            return true;
        }

        /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
           k. */

        internal virtual void Setto(System.String s)
        {
            int l = s.Length;
            int o = j + 1;
            for (int i = 0; i < l; i++)
            {
                b[o + i] = s[i];
            }
            k = j + l;
            dirty = true;
        }

        /* r(s) replaces the suffix found by the preceding ends() call with s,
           provided the remaining stem has m() > 0. */

        internal virtual void R(System.String s)
        {
            if (M() > 0)
            {
                Setto(s);
            }
        }

        /* step1() gets rid of plurals and -ed or -ing. e.g.

                caresses  ->  caress
                ponies    ->  poni
                ties      ->  ti
                caress    ->  caress
                cats      ->  cat

                feed      ->  feed
                agreed    ->  agree
                disabled  ->  disable

                matting   ->  mat
                mating    ->  mate
                meeting   ->  meet
                milling   ->  mill
                messing   ->  mess

                meetings  ->  meet

        */

        private void Step1()
        {
            if (b[k] == 's')
            {
                if (Ends("sses"))
                {
                    k -= 2;
                }
                else if (Ends("ies"))
                {
                    Setto("i");
                }
                else if (b[k - 1] != 's')
                {
                    k--;
                }
            }

            if (Ends("eed"))
            {
                if (M() > 0)
                {
                    k--;
                }
            }
            else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
            {
                k = j;
                if (Ends("at"))
                {
                    Setto("ate");
                }
                else if (Ends("bl"))
                {
                    Setto("ble");
                }
                else if (Ends("iz"))
                {
                    Setto("ize");
                }
                else if (Doublec(k))
                {
                    // Drop one of the doubled consonants unless it is l, s or z.
                    int ch = b[k--];
                    if (ch == 'l' || ch == 's' || ch == 'z')
                    {
                        k++;
                    }
                }
                else if (M() == 1 && Cvc(k))
                {
                    Setto("e");
                }
            }
        }

        /* step2() turns terminal y to i when there is another vowel in the stem. */

        private void Step2()
        {
            if (Ends("y") && Vowelinstem())
            {
                b[k] = 'i';
                dirty = true;
            }
        }

        /* step3() maps double suffices to single ones. so -ization ( = -ize plus
           -ation) maps to -ize etc. note that the string before the suffix must give
           m() > 0. */

        private void Step3()
        {
            if (k == k0)
            {
                return; /* For Bug 1 */
            }

            // Dispatch on the penultimate character to limit the suffixes tried.
            switch (b[k - 1])
            {
                case 'a':
                    if (Ends("ational")) { R("ate"); }
                    else if (Ends("tional")) { R("tion"); }
                    break;

                case 'c':
                    if (Ends("enci")) { R("ence"); }
                    else if (Ends("anci")) { R("ance"); }
                    break;

                case 'e':
                    if (Ends("izer")) { R("ize"); }
                    break;

                case 'l':
                    if (Ends("bli")) { R("ble"); }
                    else if (Ends("alli")) { R("al"); }
                    else if (Ends("entli")) { R("ent"); }
                    else if (Ends("eli")) { R("e"); }
                    else if (Ends("ousli")) { R("ous"); }
                    break;

                case 'o':
                    if (Ends("ization")) { R("ize"); }
                    else if (Ends("ation")) { R("ate"); }
                    else if (Ends("ator")) { R("ate"); }
                    break;

                case 's':
                    if (Ends("alism")) { R("al"); }
                    else if (Ends("iveness")) { R("ive"); }
                    else if (Ends("fulness")) { R("ful"); }
                    else if (Ends("ousness")) { R("ous"); }
                    break;

                case 't':
                    if (Ends("aliti")) { R("al"); }
                    else if (Ends("iviti")) { R("ive"); }
                    else if (Ends("biliti")) { R("ble"); }
                    break;

                case 'g':
                    if (Ends("logi")) { R("log"); }
                    break;
            }
        }

        /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

        private void Step4()
        {
            switch (b[k])
            {
                case 'e':
                    if (Ends("icate")) { R("ic"); }
                    else if (Ends("ative")) { R(""); }
                    else if (Ends("alize")) { R("al"); }
                    break;

                case 'i':
                    if (Ends("iciti")) { R("ic"); }
                    break;

                case 'l':
                    if (Ends("ical")) { R("ic"); }
                    else if (Ends("ful")) { R(""); }
                    break;

                case 's':
                    if (Ends("ness")) { R(""); }
                    break;
            }
        }

        /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

        private void Step5()
        {
            if (k == k0)
            {
                return; /* for Bug 1 */
            }

            // Each case either finds a removable suffix (break) or leaves the
            // word untouched (return).
            switch (b[k - 1])
            {
                case 'a':
                    if (Ends("al")) { break; }
                    return;

                case 'c':
                    if (Ends("ance")) { break; }
                    if (Ends("ence")) { break; }
                    return;

                case 'e':
                    if (Ends("er")) { break; }
                    return;

                case 'i':
                    if (Ends("ic")) { break; }
                    return;

                case 'l':
                    if (Ends("able")) { break; }
                    if (Ends("ible")) { break; }
                    return;

                case 'n':
                    if (Ends("ant")) { break; }
                    if (Ends("ement")) { break; }
                    if (Ends("ment")) { break; }
                    /* element etc. not stripped before the m */
                    if (Ends("ent")) { break; }
                    return;

                case 'o':
                    /* j >= k0 fixes Bug 2: 'ion' on its own leaves j just before
                       the start of the word, so b[j] must not be read. */
                    if (Ends("ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) { break; }
                    if (Ends("ou")) { break; }
                    return;
                    /* takes care of -ous */

                case 's':
                    if (Ends("ism")) { break; }
                    return;

                case 't':
                    if (Ends("ate")) { break; }
                    if (Ends("iti")) { break; }
                    return;

                case 'u':
                    if (Ends("ous")) { break; }
                    return;

                case 'v':
                    if (Ends("ive")) { break; }
                    return;

                case 'z':
                    if (Ends("ize")) { break; }
                    return;

                default:
                    return;
            }

            if (M() > 1)
            {
                k = j;
            }
        }

        /* step6() removes a final -e if m() > 1. */

        private void Step6()
        {
            j = k;
            if (b[k] == 'e')
            {
                int a = M();
                if (a > 1 || a == 1 && !Cvc(k - 1))
                {
                    k--;
                }
            }
            if (b[k] == 'l' && Doublec(k) && M() > 1)
            {
                k--;
            }
        }


        /// <summary>
        /// Stems a word provided as a string and returns the result as a string.
        /// When stemming leaves the word unchanged, the input instance
        /// <paramref name="s"/> itself is returned.
        /// </summary>
        /// <param name="s">The word to stem.</param>
        public virtual System.String Stem(System.String s)
        {
            if (Stem(s.ToCharArray(), s.Length))
            {
                return ToString();
            }
            return s;
        }

        /// <summary>
        /// Stems a word contained in a char[]. Returns true if the stemming process
        /// resulted in a word different from the input. Retrieve the result with
        /// <c>ResultLength</c>/<c>ResultBuffer</c> or <c>ToString()</c>.
        /// </summary>
        /// <param name="word">The characters of the word; the whole array is used.</param>
        public virtual bool Stem(char[] word)
        {
            return Stem(word, word.Length);
        }

        /// <summary>
        /// Stems a word contained in a portion of a char[] array. The characters are
        /// copied into the internal buffer (which is reallocated when too small), so
        /// <paramref name="wordBuffer"/> is not modified. Returns true if the
        /// stemming process resulted in a word different from the input. Retrieve
        /// the result with <c>ResultLength</c>/<c>ResultBuffer</c> or <c>ToString()</c>.
        /// </summary>
        /// <param name="wordBuffer">Array containing the word.</param>
        /// <param name="offset">Index of the first character of the word.</param>
        /// <param name="wordLen">Number of characters in the word.</param>
        public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
        {
            Reset();
            if (b.Length < wordLen)
            {
                b = new char[wordLen + EXTRA];
            }
            Array.Copy(wordBuffer, offset, b, 0, wordLen);
            i = wordLen;
            return Stem(0);
        }

        /// <summary>
        /// Stems a word contained in a leading portion of a char[] array.
        /// Returns true if the stemming process resulted in a word different
        /// from the input. Retrieve the result with
        /// <c>ResultLength</c>/<c>ResultBuffer</c> or <c>ToString()</c>.
        /// </summary>
        /// <param name="word">Array whose first <paramref name="wordLen"/> characters form the word.</param>
        /// <param name="wordLen">Number of characters in the word.</param>
        public virtual bool Stem(char[] word, int wordLen)
        {
            return Stem(word, 0, wordLen);
        }

        /// <summary>
        /// Stems the word placed into the stemmer buffer through calls to <c>Add</c>.
        /// Returns true if the stemming process resulted in a word different
        /// from the input. Retrieve the result with
        /// <c>ResultLength</c>/<c>ResultBuffer</c> or <c>ToString()</c>.
        /// </summary>
        public virtual bool Stem()
        {
            return Stem(0);
        }

        /// <summary>
        /// Stems the buffered characters, treating index <paramref name="i0"/> as the
        /// first character of the word. Words of fewer than three characters are
        /// left untouched. Returns true if the buffer content or length changed.
        /// </summary>
        /// <param name="i0">Index in the internal buffer where the word starts.</param>
        public virtual bool Stem(int i0)
        {
            k = i - 1;
            k0 = i0;
            if (k > k0 + 1)
            {
                Step1();
                Step2();
                Step3();
                Step4();
                Step5();
                Step6();
            }
            // Also, a word is considered dirty if we lopped off letters
            // Thanks to Ifigenia Vairelles for pointing this out.
            if (i != k + 1)
            {
                dirty = true;
            }
            i = k + 1;
            return dirty;
        }

        /// <summary>
        /// Test program for demonstrating the Stemmer. It reads each file named on
        /// the command line byte by byte, stems every run of letters (lower-cased)
        /// and writes the result, together with the non-letter bytes in between,
        /// to standard out.
        /// Usage: Stemmer file-name
        /// </summary>
        [STAThread]
        public static void Main(System.String[] args)
        {
            var s = new PorterStemmer();

            for (int i = 0; i < args.Length; i++)
            {
                try
                {
                    // 'using' guarantees the file handle is released even when a read fails.
                    using (System.IO.Stream input = new System.IO.FileStream(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read))
                    {
                        var buffer = new byte[1024];
                        int bufferLen = 0; // forces the first Read inside the loop
                        int offset = 0;
                        s.Reset();

                        while (true)
                        {
                            int ch;
                            if (offset < bufferLen)
                            {
                                ch = buffer[offset++];
                            }
                            else
                            {
                                bufferLen = input.Read(buffer, 0, buffer.Length);
                                offset = 0;
                                // Stream.Read signals end of stream with 0 (not -1 as in
                                // Java); testing "< 0" would loop forever on stale data.
                                if (bufferLen <= 0)
                                {
                                    ch = -1;
                                }
                                else
                                {
                                    ch = buffer[offset++];
                                }
                            }

                            // Only cast real bytes; the -1 end-of-file marker is not a character.
                            if (ch >= 0 && Char.IsLetter((char) ch))
                            {
                                s.Add(Char.ToLower((char) ch));
                            }
                            else
                            {
                                s.Stem();
                                Console.Out.Write(s.ToString());
                                s.Reset();
                                if (ch < 0)
                                {
                                    break;
                                }
                                System.Console.Out.Write((char) ch);
                            }
                        }
                    }
                }
                catch (System.IO.IOException)
                {
                    Console.Out.WriteLine("error reading " + args[i]);
                }
            }
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace Lucene.Net.Analysis
{

    /// <summary>
    /// An <see cref="Analyzer" /> whose token streams are produced by a
    /// <see cref="LowerCaseTokenizer" />.
    /// </summary>
    /// <remarks>
    /// The previous summary described this as a <see cref="LetterTokenizer" />
    /// filtered by a <see cref="LowerCaseFilter" />; presumably
    /// <see cref="LowerCaseTokenizer" /> performs both steps at once (confirm
    /// against that class).
    /// </remarks>
    public sealed class SimpleAnalyzer : Analyzer
    {
        /// <summary>Creates a new <see cref="LowerCaseTokenizer" /> over <paramref name="reader" />.</summary>
        /// <param name="fieldName">Not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            return new LowerCaseTokenizer(reader);
        }

        /// <summary>
        /// Returns the tokenizer stored in <c>PreviousTokenStream</c>, reset to read
        /// from <paramref name="reader" />; when none has been stored yet, a new
        /// <see cref="LowerCaseTokenizer" /> is created and stored first.
        /// </summary>
        /// <param name="fieldName">Not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            var cached = (Tokenizer) PreviousTokenStream;
            if (cached != null)
            {
                // Re-point the saved tokenizer at the new input.
                cached.Reset(reader);
                return cached;
            }

            Tokenizer created = new LowerCaseTokenizer(reader);
            PreviousTokenStream = created;
            return created;
        }
    }
}
- See the License for the specific language governing permissions and - limitations under the License. -*/ - - -WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate - the tokenizer, use only Java 1.4! - This grammar currently uses constructs (e.g. :digit:, :letter:) whose - meaning can vary according to the JRE used to run jflex. See - https://issues.apache.org/jira/browse/LUCENE-1126 for details. - To preserve backwards compatibility, only Java 1.4 is supported - for now - this will change in Lucene 3.1. http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/Standard/StandardAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/Standard/StandardAnalyzer.cs b/src/Lucene.Net.Core/Analysis/Standard/StandardAnalyzer.cs deleted file mode 100644 index bf704be..0000000 --- a/src/Lucene.Net.Core/Analysis/Standard/StandardAnalyzer.cs +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Util;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary> Filters <see cref="StandardTokenizer" /> with <see cref="StandardFilter" />,
    /// <see cref="LowerCaseFilter" /> and <see cref="StopFilter" />, using a list of English stop
    /// words.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StandardAnalyzer:
    /// <list type="bullet">
    /// <item>As of 2.9, StopFilter preserves position increments</item>
    /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
    /// </list>
    /// </summary>
    public class StandardAnalyzer : Analyzer
    {
        /// <summary>Stop words handed to every <see cref="StopFilter" /> this analyzer creates.</summary>
        private readonly ISet<string> stopSet;

        /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
        /// See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">https://issues.apache.org/jira/browse/LUCENE-1068</a>
        /// </summary>
        private readonly bool replaceInvalidAcronym;

        /// <summary>Position-increment setting passed to <see cref="StopFilter" />; taken from
        /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> for the match version.
        /// </summary>
        private readonly bool enableStopPositionIncrements;

        /// <summary>An unmodifiable set containing some common English words that are usually not
        /// useful for searching.
        /// </summary>
        public static readonly ISet<string> STOP_WORDS_SET;

        /// <summary>Compatibility version supplied at construction; forwarded to each tokenizer.</summary>
        private readonly Version matchVersion;

        /// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET" />).
        /// </summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary.</param>
        public StandardAnalyzer(Version matchVersion)
            : this(matchVersion, STOP_WORDS_SET)
        {
        }

        /// <summary>Builds an analyzer with the given stop words.</summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary.</param>
        /// <param name="stopWords">stop words</param>
        public StandardAnalyzer(Version matchVersion, ISet<string> stopWords)
        {
            stopSet = stopWords;
            SetOverridesTokenStreamMethod<StandardAnalyzer>();
            enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
            this.matchVersion = matchVersion;
        }

        /// <summary>Builds an analyzer with the stop words from the given file.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)" />
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary.</param>
        /// <param name="stopwords">File to read stop words from</param>
        public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)" />
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary.</param>
        /// <param name="stopwords">Reader to read stop words from</param>
        public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />,
        /// a <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />.
        /// </summary>
        /// <param name="fieldName">Not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
            tokenStream.MaxTokenLength = maxTokenLength;
            TokenStream result = new StandardFilter(tokenStream);
            result = new LowerCaseFilter(result);
            result = new StopFilter(enableStopPositionIncrements, result, stopSet);
            return result;
        }

        /// <summary>Tokenizer and filter chain kept between calls to <c>ReusableTokenStream</c>.</summary>
        private sealed class SavedStreams
        {
            internal StandardTokenizer tokenStream;
            internal TokenStream filteredTokenStream;
        }

        /// <summary>Default maximum allowed token length </summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary> Set maximum allowed token length.  If a token is seen
        /// that exceeds this length then it is discarded.  This
        /// setting only takes effect the next time tokenStream or
        /// reusableTokenStream is called.
        /// </summary>
        public virtual int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        /// <summary>
        /// Returns the saved tokenizer/filter chain re-pointed at <paramref name="reader" />,
        /// building and saving the chain on first use. The current maximum token length and
        /// acronym-replacement setting are applied to the tokenizer on every call.
        /// </summary>
        /// <param name="fieldName">Only used when falling back to <c>TokenStream</c>.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return TokenStream(fieldName, reader);
            }

            SavedStreams streams = (SavedStreams) PreviousTokenStream;
            if (streams == null)
            {
                streams = new SavedStreams();
                PreviousTokenStream = streams;
                streams.tokenStream = new StandardTokenizer(matchVersion, reader);
                streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
                streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
                streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
                                                             streams.filteredTokenStream, stopSet);
            }
            else
            {
                streams.tokenStream.Reset(reader);
            }
            streams.tokenStream.MaxTokenLength = maxTokenLength;

            streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);

            return streams.filteredTokenStream;
        }

        static StandardAnalyzer()
        {
            STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Token = Lucene.Net.Analysis.Token;
using TokenFilter = Lucene.Net.Analysis.TokenFilter;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary>Normalizes tokens extracted with <see cref="StandardTokenizer" />. </summary>
    public sealed class StandardFilter : TokenFilter
    {

        /// <summary>Construct filtering <i>in</i>. </summary>
        /// <param name="in_Renamed">The token stream to normalize.</param>
        public StandardFilter(TokenStream in_Renamed) : base(in_Renamed)
        {
            termAtt = AddAttribute<ITermAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        // Type names this filter reacts to, looked up once from the scanner's table.
        private static readonly System.String APOSTROPHE_TYPE;
        private static readonly System.String ACRONYM_TYPE;

        // this filter uses the type and term attributes
        private readonly ITypeAttribute typeAtt;
        private readonly ITermAttribute termAtt;

        /// <summary>
        /// Advances the wrapped stream to its next token and normalizes that token in place.
        /// <p/>Removes <tt>'s</tt> from the end of apostrophe-typed words.
        /// <p/>Removes dots from acronym-typed words.
        /// </summary>
        /// <returns>false when the wrapped stream has no more tokens; true otherwise.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.TermBuffer();
            int bufferLength = termAtt.TermLength();
            System.String type = typeAtt.Type;

            // Compare by value: an identity comparison would silently skip a token whose
            // type string is equal to, but not the same instance as, the constant.
            if (System.String.Equals(type, APOSTROPHE_TYPE, StringComparison.Ordinal))
            {
                bool endsWithApostropheS = bufferLength >= 2
                    && buffer[bufferLength - 2] == '\''
                    && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S');
                if (endsWithApostropheS)
                {
                    // Strip last 2 characters off
                    termAtt.SetTermLength(bufferLength - 2);
                }
            }
            else if (System.String.Equals(type, ACRONYM_TYPE, StringComparison.Ordinal))
            {
                // remove dots, compacting the remaining characters to the front of the buffer
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                termAtt.SetTermLength(upto);
            }

            return true;
        }

        static StandardFilter()
        {
            APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
            ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using CharReader = Lucene.Net.Analysis.CharReader;
using Token = Lucene.Net.Analysis.Token;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{

    /// <summary>A grammar-based tokenizer constructed with JFlex
    ///
    /// <p/> This should be a good tokenizer for most European-language documents:
    ///
    /// <list type="bullet">
    /// <item>Splits words at punctuation characters, removing punctuation. However, a
    /// dot that's not followed by whitespace is considered part of a token.</item>
    /// <item>Splits words at hyphens, unless there's a number in the token, in which case
    /// the whole token is interpreted as a product number and is not split.</item>
    /// <item>Recognizes email addresses and internet hostnames as one token.</item>
    /// </list>
    ///
    /// <p/>Many applications have specific tokenizer needs.  If this tokenizer does
    /// not suit your application, please consider copying this source code
    /// directory to your project and maintaining your own grammar-based tokenizer.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StandardTokenizer:
    /// <list type="bullet">
    /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
    /// </list>
    /// </summary>
    public sealed class StandardTokenizer : Tokenizer
    {
        /// <summary>A private instance of the JFlex-constructed scanner </summary>
        private StandardTokenizerImpl scanner;

        // Integer token type constants; each indexes its name in TOKEN_TYPES.
        public const int ALPHANUM = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM = 2;
        public const int COMPANY = 3;
        public const int EMAIL = 4;
        public const int HOST = 5;
        public const int NUM = 6;
        public const int CJ = 7;

        /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
        /// as ACRONYMs.
        /// </deprecated>
        [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
        public const int ACRONYM_DEP = 8;

        /// <summary>String token types that correspond to token type int constants </summary>
        public static readonly System.String[] TOKEN_TYPES = new System.String[]
        {
            "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>",
            "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"
        };

        // When true, tokens the scanner reports as ACRONYM_DEP are emitted as HOST
        // with their final character removed.
        private bool replaceInvalidAcronym;

        private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary>Set the max allowed token length.  Any token longer
        /// than this is skipped.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { this.maxTokenLength = value; }
        }

        /// <summary> Creates a new instance of the
        /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" />. Attaches
        /// the <c>input</c> to the newly created JFlex scanner.
        /// </summary>
        /// <param name="matchVersion">Compatibility version; from 2.4 on, mischaracterized
        /// acronyms are replaced by HOST tokens.</param>
        /// <param name="input">The input reader
        ///
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        public StandardTokenizer(Version matchVersion, System.IO.TextReader input) : base()
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary> Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.</summary>
        public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input) : base(source)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary> Creates a new StandardTokenizer with a given
        /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />
        /// </summary>
        public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input) : base(factory)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        // Shared constructor tail: derives the acronym setting from the version,
        // stores the reader and registers the four attributes this tokenizer fills.
        private void Init(System.IO.TextReader input, Version matchVersion)
        {
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
            this.input = input;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        // this tokenizer generates four attributes:
        // term, offset, positionIncrement and type
        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private IPositionIncrementAttribute posIncrAtt;
        private ITypeAttribute typeAtt;

        ///<summary>
        /// Reads scanner tokens until one no longer than <c>MaxTokenLength</c> is found and
        /// publishes it through the term, offset, position-increment and type attributes.
        /// See <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />.
        ///</summary>
        /// <returns>false once the scanner reports end of input; true otherwise.</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            for (;;)
            {
                int tokenType = scanner.GetNextToken();
                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() > maxTokenLength)
                {
                    // When we skip a too-long term, we still increment the
                    // position increment
                    posIncr++;
                    continue;
                }

                posIncrAtt.PositionIncrement = posIncr;
                scanner.GetText(termAtt);
                int start = scanner.Yychar();
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));

                // The ACRONYM_DEP branch should be removed in the next release. For now, it
                // converts invalid acronyms to HOST. When removed, only the plain
                // TOKEN_TYPES lookup should remain.
                if (tokenType != StandardTokenizerImpl.ACRONYM_DEP)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
                }
                else if (replaceInvalidAcronym)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                    termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                }
                return true;
            }
        }

        /// <summary>Sets both offsets to the corrected position just past the last scanned text.</summary>
        public override void End()
        {
            // set final offset
            int scannedEnd = scanner.Yychar() + scanner.Yylength();
            int finalOffset = CorrectOffset(scannedEnd);
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>Resets the base tokenizer and then the scanner to read from <paramref name="reader" />.</summary>
        public override void Reset(System.IO.TextReader reader)
        {
            base.Reset(reader);
            scanner.Reset(reader);
        }

        /// <summary>
        /// Remove in 3.X and make true the only valid value
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
        /// </param>
        [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
        public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }
    }
}
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/* - NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate the tokenizer, - the tokenizer, only use Java 1.4 !!! - This grammar currently uses constructs (eg :digit:, :letter:) whose - meaning can vary according to the JRE used to run jflex. See - https://issues.apache.org/jira/browse/LUCENE-1126 for details. - For current backwards compatibility it is needed to support - only Java 1.4 - this will change in Lucene 3.1. 
-*/ - -using System; -using Lucene.Net.Analysis.Tokenattributes; -using Token = Lucene.Net.Analysis.Token; - -namespace Lucene.Net.Analysis.Standard -{ - - - /// <summary> This class is a scanner generated by - /// <a href="http://www.jflex.de/">JFlex</a> 1.4.1 - /// on 9/4/08 6:49 PM from the specification file - /// <tt>/tango/mike/src/lucene.standarddigit/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt> - /// </summary> - class StandardTokenizerImpl - { - - /// <summary>This character denotes the end of file </summary> - public const int YYEOF = - 1; - - /// <summary>initial size of the lookahead buffer </summary> - private const int ZZ_BUFFERSIZE = 16384; - - /// <summary>lexical states </summary> - public const int YYINITIAL = 0; - - /// <summary> Translates characters to character classes</summary> - private const System.String ZZ_CMAP_PACKED = "\x0009\x0000\x0001\x0000\x0001\x000D\x0001\x0000\x0001\x0000\x0001\x000C\x0012\x0000\x0001\x0000\x0005\x0000\x0001\x0005" + "\x0001\x0003\x0004\x0000\x0001\x0009\x0001\x0007\x0001\x0004\x0001\x0009\x000A\x0002\x0006\x0000\x0001\x0006\x001A\x000A" + "\x0004\x0000\x0001\x0008\x0001\x0000\x001A\x000A\x002F\x0000\x0001\x000A\x000A\x0000\x0001\x000A\x0004\x0000\x0001\x000A" + "\x0005\x0000\x0017\x000A\x0001\x0000\x001F\x000A\x0001\x0000\u0128\x000A\x0002\x0000\x0012\x000A\x001C\x0000\x005E\x000A" + "\x0002\x0000\x0009\x000A\x0002\x0000\x0007\x000A\x000E\x0000\x0002\x000A\x000E\x0000\x0005\x000A\x0009\x0000\x0001\x000A" + "\x008B\x0000\x0001\x000A\x000B\x0000\x0001\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0014\x000A" + "\x0001\x0000\x002C\x000A\x0001\x0000\x0008\x000A\x0002\x0000\x001A\x000A\x000C\x0000\x0082\x000A\x000A\x0000\x0039\x000A" + "\x0002\x0000\x0002\x000A\x0002\x0000\x0002\x000A\x0003\x0000\x0026\x 000A\x0002\x0000\x0002\x000A\x0037\x0000\x0026\x000A" + 
"\x0002\x0000\x0001\x000A\x0007\x0000\x0027\x000A\x0048\x0000\x001B\x000A\x0005\x0000\x0003\x000A\x002E\x0000\x001A\x000A" + "\x0005\x0000\x000B\x000A\x0015\x0000\x000A\x0002\x0007\x0000\x0063\x000A\x0001\x0000\x0001\x000A\x000F\x0000\x0002\x000A" + "\x0009\x0000\x000A\x0002\x0003\x000A\x0013\x0000\x0001\x000A\x0001\x0000\x001B\x000A\x0053\x0000\x0026\x000A\u015f\x0000" + "\x0035\x000A\x0003\x0000\x0001\x000A\x0012\x0000\x0001\x000A\x0007\x0000\x000A\x000A\x0004\x0000\x000A\x0002\x0015\x0000" + "\x0008\x000A\x0002\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0001\x000A\x0003\x0000" + "\x0004\x000A\x0022\x0000\x0002\x000A\x0001\x0000\x0003\x000A\x0004\x0000\x000A\x0002\x0002\x000A\x0013\x0000\x0006\x000A" + "\x0004\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0002\x000A\x0001\x0000\x0002\x000A" + - "\x0001\x0000\x0002\x000A\x001F\x0000\x0004\x000A\x0001\x0000\x0001\x000A\x0007\x0000\x000A\x0002\x0002\x0000\x0003\x000A" + "\x0010\x0000\x0007\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0016\x000A\x0001\x0000\x0007\x000A" + "\x0001\x0000\x0002\x000A\x0001\x0000\x0005\x000A\x0003\x0000\x0001\x000A\x0012\x0000\x0001\x000A\x000F\x0000\x0001\x000A" + "\x0005\x0000\x000A\x0002\x0015\x0000\x0008\x000A\x0002\x0000\x0002\x000A\x0002\x0000\x0016\x000A\x0001\x0000\x0007\x000A" + "\x0001\x0000\x0002\x000A\x0002\x0000\x0004\x000A\x0003\x0000\x0001\x000A\x001E\x0000\x0002\x000A\x0001\x0000\x0003\x000A" + "\x0004\x0000\x000A\x0002\x0015\x0000\x0006\x000A\x0003\x0000\x0003\x000A\x0001\x0000\x0004\x000A\x0003\x0000\x0002\x000A" + "\x0001\x0000\x0001\x000A\x0001\x0000\x0002\x000A\x0003\x0000\x0002\x000A\x0003\x0000\x0003\x000A\x0003\x0000\x0008\x000A" + "\x0001\x0000\x0003\x000A\x002D\x0000\x0009\x0002\x0015\x0000\x0008\x000A\x0001\x0000\x0003\x000A\x0001\x0000\ x0017\x000A" + 
"\x0001\x0000\x000A\x000A\x0001\x0000\x0005\x000A\x0026\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0008\x000A" + "\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A\x0001\x0000\x000A\x000A\x0001\x0000\x0005\x000A\x0024\x0000\x0001\x000A" + "\x0001\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0008\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0017\x000A" + "\x0001\x0000\x0010\x000A\x0026\x0000\x0002\x000A\x0004\x0000\x000A\x0002\x0015\x0000\x0012\x000A\x0003\x0000\x0018\x000A" + "\x0001\x0000\x0009\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0007\x000A\x0039\x0000\x0001\x0001\x0030\x000A\x0001\x0001" + "\x0002\x000A\x000C\x0001\x0007\x000A\x0009\x0001\x000A\x0002\x0027\x0000\x0002\x000A\x0001\x0000\x0001\x000A\x0002\x0000" + "\x0002\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0001\x000A\x0006\x0000\x0004\x000A\x0001\x0000\x0007\x000A\x0001\x0000" + "\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0001\x000A\x0002\x0000\x0002\x000A\x0001\x0000\x0004\x00 0A\x0001\x0000" + - "\x0002\x000A\x0009\x0000\x0001\x000A\x0002\x0000\x0005\x000A\x0001\x0000\x0001\x000A\x0009\x0000\x000A\x0002\x0002\x0000" + "\x0002\x000A\x0022\x0000\x0001\x000A\x001F\x0000\x000A\x0002\x0016\x0000\x0008\x000A\x0001\x0000\x0022\x000A\x001D\x0000" + "\x0004\x000A\x0074\x0000\x0022\x000A\x0001\x0000\x0005\x000A\x0001\x0000\x0002\x000A\x0015\x0000\x000A\x0002\x0006\x0000" + "\x0006\x000A\x004A\x0000\x0026\x000A\x000A\x0000\x0027\x000A\x0009\x0000\x005A\x000A\x0005\x0000\x0044\x000A\x0005\x0000" + "\x0052\x000A\x0006\x0000\x0007\x000A\x0001\x0000\x003F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000" + "\x0007\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0027\x000A\x0001\x0000\x0001\x000A\x0001\x0000" + "\x0004\x000A\x0002\x0000\x001F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000\x0007\x000A\x0001\x0000\x0017\x000A\ 
x0001\x0000" + "\x001F\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0002\x0000\x0007\x000A\x0001\x0000\x0027\x000A\x0001\x0000" + "\x0013\x000A\x000E\x0000\x0009\x0002\x002E\x0000\x0055\x000A\x000C\x0000\u026c\x000A\x0002\x0000\x0008\x000A\x000A\x0000" + "\x001A\x000A\x0005\x0000\x004B\x000A\x0095\x0000\x0034\x000A\x002C\x0000\x000A\x0002\x0026\x0000\x000A\x0002\x0006\x0000" + "\x0058\x000A\x0008\x0000\x0029\x000A\u0557\x0000\x009C\x000A\x0004\x0000\x005A\x000A\x0006\x0000\x0016\x000A\x0002\x0000" + "\x0006\x000A\x0002\x0000\x0026\x000A\x0002\x0000\x0006\x000A\x0002\x0000\x0008\x000A\x0001\x0000\x0001\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x001F\x000A\x0002\x0000\x0035\x000A\x0001\x0000\x0007\x000A\x0001\x0000" + "\x0001\x000A\x0003\x0000\x0003\x000A\x0001\x0000\x0007\x000A\x0003\x0000\x0004\x000A\x0002\x0000\x0006\x000A\x0004\x0000" + "\x000D\x000A\x0005\x0000\x0003\x000A\x0001\x0000\x0007\x000A\x0082\x0000\x0001\x000A\x0082\x0000\x0001\x00 0A\x0004\x0000" + - "\x0001\x000A\x0002\x0000\x000A\x000A\x0001\x0000\x0001\x000A\x0003\x0000\x0005\x000A\x0006\x0000\x0001\x000A\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0004\x000A\x0001\x0000\x0003\x000A\x0001\x0000\x0007\x000A\u0ecb\x0000" + "\x0002\x000A\x002A\x0000\x0005\x000A\x000A\x0000\x0001\x000B\x0054\x000B\x0008\x000B\x0002\x000B\x0002\x000B\x005A\x000B" + "\x0001\x000B\x0003\x000B\x0006\x000B\x0028\x000B\x0003\x000B\x0001\x0000\x005E\x000A\x0011\x0000\x0018\x000A\x0038\x0000" + "\x0010\x000B\u0100\x0000\x0080\x000B\x0080\x0000\u19b6\x000B\x000A\x000B\x0040\x0000\u51a6\x000B\x005A\x000B\u048d\x000A" + "\u0773\x0000\u2ba4\x000A\u215c\x0000\u012e\x000B\x00D2\x000B\x0007\x000A\x000C\x0000\x0005\x000A\x0005\x0000\x0001\x000A" + "\x0001\x0000\x000A\x000A\x0001\x0000\x000D\x000A\x0001\x0000\x0005\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0002\x000A" + 
"\x0001\x0000\x0002\x000A\x0001\x0000\x006C\x000A\x0021\x0000\u016b\x000A\x0012\x0000\x0040\x000A\x0002\x0000\ x0036\x000A" + "\x0028\x0000\x000C\x000A\x0074\x0000\x0003\x000A\x0001\x0000\x0001\x000A\x0001\x0000\x0087\x000A\x0013\x0000\x000A\x0002" + "\x0007\x0000\x001A\x000A\x0006\x0000\x001A\x000A\x000A\x0000\x0001\x000B\x003A\x000B\x001F\x000A\x0003\x0000\x0006\x000A" + "\x0002\x0000\x0006\x000A\x0002\x0000\x0006\x000A\x0002\x0000\x0003\x000A\x0023\x0000"; - - /// <summary> Translates characters to character classes</summary> - private static readonly char[] ZZ_CMAP = ZzUnpackCMap(ZZ_CMAP_PACKED); - - /// <summary> Translates DFA states to action switch labels.</summary> - private static readonly int[] ZZ_ACTION = ZzUnpackAction(); - - private const System.String ZZ_ACTION_PACKED_0 = "\x0001\x0000\x0001\x0001\x0003\x0002\x0001\x0003\x0001\x0001\x000B\x0000\x0001\x0002\x0003\x0004" + "\x0002\x0000\x0001\x0005\x0001\x0000\x0001\x0005\x0003\x0004\x0006\x0005\x0001\x0006\x0001\x0004" + "\x0002\x0007\x0001\x0008\x0001\x0000\x0001\x0008\x0003\x0000\x0002\x0008\x0001\x0009\x0001\x000A" + "\x0001\x0004"; - - private static int[] ZzUnpackAction() - { - int[] result = new int[51]; - int offset = 0; - offset = ZzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); - return result; - } - - private static int ZzUnpackAction(System.String packed, int offset, int[] result) - { - int i = 0; /* index in packed string */ - int j = offset; /* index in unpacked array */ - int l = packed.Length; - while (i < l) - { - int count = packed[i++]; - int value_Renamed = packed[i++]; - do - result[j++] = value_Renamed; - while (--count > 0); - } - return j; - } - - - /// <summary> Translates a state to a row index in the transition table</summary> - private static readonly int[] ZZ_ROWMAP = ZzUnpackRowMap(); - - private const System.String ZZ_ROWMAP_PACKED_0 = "\x0000\x0000\x0000\x000E\x0000\x001C\x0000\x002A\x0000\x0038\x0000\x000E\x0000\x0046\x0000\x0054" + 
"\x0000\x0062\x0000\x0070\x0000\x007E\x0000\x008C\x0000\x009A\x0000\x00A8\x0000\x00B6\x0000\x00C4" + "\x0000\x00D2\x0000\x00E0\x0000\x00EE\x0000\x00FC\x0000\u010a\x0000\u0118\x0000\u0126\x0000\u0134" + "\x0000\u0142\x0000\u0150\x0000\u015e\x0000\u016c\x0000\u017a\x0000\u0188\x0000\u0196\x0000\u01a4" + "\x0000\u01b2\x0000\u01c0\x0000\u01ce\x0000\u01dc\x0000\u01ea\x0000\u01f8\x0000\x00D2\x0000\u0206" + "\x0000\u0214\x0000\u0222\x0000\u0230\x0000\u023e\x0000\u024c\x0000\u025a\x0000\x0054\x0000\x008C" + "\x0000\u0268\x0000\u0276\x0000\u0284"; - - private static int[] ZzUnpackRowMap() - { - int[] result = new int[51]; - int offset = 0; - offset = ZzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); - return result; - } - - private static int ZzUnpackRowMap(System.String packed, int offset, int[] result) - { - int i = 0; /* index in packed string */ - int j = offset; /* index in unpacked array */ - int l = packed.Length; - while (i < l) - { - int high = packed[i++] << 16; - result[j++] = high | packed[i++]; - } - return j; - } - - /// <summary> The transition table of the DFA</summary> - private static readonly int[] ZZ_TRANS = ZzUnpackTrans(); - - private const System.String ZZ_TRANS_PACKED_0 = "\x0001\x0002\x0001\x0003\x0001\x0004\x0007\x0002\x0001\x0005\x0001\x0006\x0001\x0007\x0001\x0002" + "\x000F\x0000\x0002\x0003\x0001\x0000\x0001\x0008\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B" + "\x0001\x0003\x0004\x0000\x0001\x0003\x0001\x0004\x0001\x0000\x0001\x000C\x0001\x0000\x0001\x0009" + "\x0002\x000D\x0001\x000E\x0001\x0004\x0004\x0000\x0001\x0003\x0001\x0004\x0001\x000F\x0001\x0010" + "\x0001\x0011\x0001\x0012\x0002\x000A\x0001\x000B\x0001\x0013\x0010\x0000\x0001\x0002\x0001\x0000" + "\x0001\x0014\x0001\x0015\x0007\x0000\x0001\x0016\x0004\x0000\x0002\x0017\x0007\x0000\x0001\x0017" + "\x0004\x0000\x0001\x0018\x0001\x0019\x0007\x0000\x0001\x001A\x0005\x0000\x0001\x001B\x0007\x0000" + 
"\x0001\x000B\x0004\x0000\x0001\x001C\x0001\x001D\x0007\x0000\x0001\x001E\x0004\x0000\x0001\x001F" + "\x0001\x0020\x0007\x0000\x0001\x0021\x0004\x0000\x0001\x0022\x0001\x0023\x0007\x0000\x0001\x0024" + "\x000D\x0000\x0001\x0025\x0004\ x0000\x0001\x0014\x0001\x0015\x0007\x0000\x0001\x0026\x000D\x0000" + "\x0001\x0027\x0004\x0000\x0002\x0017\x0007\x0000\x0001\x0028\x0004\x0000\x0001\x0003\x0001\x0004" + "\x0001\x000F\x0001\x0008\x0001\x0011\x0001\x0012\x0002\x000A\x0001\x000B\x0001\x0013\x0004\x0000" + "\x0002\x0014\x0001\x0000\x0001\x0029\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0014" + "\x0004\x0000\x0001\x0014\x0001\x0015\x0001\x0000\x0001\x002B\x0001\x0000\x0001\x0009\x0002\x002C" + "\x0001\x002D\x0001\x0015\x0004\x0000\x0001\x0014\x0001\x0015\x0001\x0000\x0001\x0029\x0001\x0000" + "\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0016\x0004\x0000\x0002\x0017\x0001\x0000\x0001\x002E" + "\x0002\x0000\x0001\x002E\x0002\x0000\x0001\x0017\x0004\x0000\x0002\x0018\x0001\x0000\x0001\x002A" + "\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0018\x0004\x0000\x0001\x0018\x0001\x0019" + "\x0001\x0000\x0001\x002C\x0001\x0000\x0001\x0009\x0002\x002C\x0001\x002D\x0001\x0019\x0004\x0000" + - "\x0001\x0018\x0001\x0019\x0001\x0000\x0001\x002A\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000" + "\x0001\x001A\x0005\x0000\x0001\x001B\x0001\x0000\x0001\x002D\x0002\x0000\x0003\x002D\x0001\x001B" + "\x0004\x0000\x0002\x001C\x0001\x0000\x0001\x002F\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B" + "\x0001\x001C\x0004\x0000\x0001\x001C\x0001\x001D\x0001\x0000\x0001\x0030\x0001\x0000\x0001\x0009" + "\x0002\x000D\x0001\x000E\x0001\x001D\x0004\x0000\x0001\x001C\x0001\x001D\x0001\x0000\x0001\x002F" + "\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B\x0001\x001E\x0004\x0000\x0002\x001F\x0001\x0000" + "\x0001\x000A\x0001\x0000\x0001\x0009\x0002\x000A\x0001\x000B\x0001\x001F\x0004\x0000\x0001\x001F" + 
"\x0001\x0020\x0001\x0000\x0001\x000D\x0001\x0000\x0001\x0009\x0002\x000D\x0001\x000E\x0001\x0020" + "\x0004\x0000\x0001\x001F\x0001\x0020\x0001\x0000\x0001\x000A\x0001\x0000\x0001\x0009\x0002\x000A" + "\x0001\x000B\x0001\x0021\x0004\x0000\x0002\x0022\x0001\x0000\x0001\x000B\x0 002\x0000\x0003\x000B" + "\x0001\x0022\x0004\x0000\x0001\x0022\x0001\x0023\x0001\x0000\x0001\x000E\x0002\x0000\x0003\x000E" + "\x0001\x0023\x0004\x0000\x0001\x0022\x0001\x0023\x0001\x0000\x0001\x000B\x0002\x0000\x0003\x000B" + "\x0001\x0024\x0006\x0000\x0001\x000F\x0006\x0000\x0001\x0025\x0004\x0000\x0001\x0014\x0001\x0015" + "\x0001\x0000\x0001\x0031\x0001\x0000\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0016\x0004\x0000" + "\x0002\x0017\x0001\x0000\x0001\x002E\x0002\x0000\x0001\x002E\x0002\x0000\x0001\x0028\x0004\x0000" + "\x0002\x0014\x0007\x0000\x0001\x0014\x0004\x0000\x0002\x0018\x0007\x0000\x0001\x0018\x0004\x0000" + "\x0002\x001C\x0007\x0000\x0001\x001C\x0004\x0000\x0002\x001F\x0007\x0000\x0001\x001F\x0004\x0000" + "\x0002\x0022\x0007\x0000\x0001\x0022\x0004\x0000\x0002\x0032\x0007\x0000\x0001\x0032\x0004\x0000" + "\x0002\x0014\x0007\x0000\x0001\x0033\x0004\x0000\x0002\x0032\x0001\x0000\x0001\x002E\x0002\x0000" + "\x0001\x002E\x0002\x0000\x0001\x0032\x0004\x0000\x0002\x0014\x 0001\x0000\x0001\x0031\x0001\x0000" + - "\x0001\x0009\x0002\x002A\x0001\x0000\x0001\x0014\x0003\x0000"; - - private static int[] ZzUnpackTrans() - { - int[] result = new int[658]; - int offset = 0; - offset = ZzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); - return result; - } - - private static int ZzUnpackTrans(System.String packed, int offset, int[] result) - { - int i = 0; /* index in packed string */ - int j = offset; /* index in unpacked array */ - int l = packed.Length; - while (i < l) - { - int count = packed[i++]; - int value_Renamed = packed[i++]; - value_Renamed--; - do - result[j++] = value_Renamed; - while (--count > 0); - } - return j; - } - - - /* error codes */ - private const int 
ZZ_UNKNOWN_ERROR = 0; - private const int ZZ_NO_MATCH = 1; - private const int ZZ_PUSHBACK_2BIG = 2; - - /* error messages for the codes above */ - private static readonly System.String[] ZZ_ERROR_MSG = new System.String[]{"Unkown internal scanner error", "Error: could not match input", "Error: pushback value was too large"}; - - /// <summary> ZZ_ATTRIBUTE[aState] contains the attributes of state <c>aState</c></summary> - private static readonly int[] ZZ_ATTRIBUTE = ZzUnpackAttribute(); - - private const System.String ZZ_ATTRIBUTE_PACKED_0 = "\x0001\x0000\x0001\x0009\x0003\x0001\x0001\x0009\x0001\x0001\x000B\x0000\x0004\x0001\x0002\x0000" + "\x0001\x0001\x0001\x0000\x000F\x0001\x0001\x0000\x0001\x0001\x0003\x0000\x0005\x0001"; - - private static int[] ZzUnpackAttribute() - { - int[] result = new int[51]; - int offset = 0; - offset = ZzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); - return result; - } - - private static int ZzUnpackAttribute(System.String packed, int offset, int[] result) - { - int i = 0; /* index in packed string */ - int j = offset; /* index in unpacked array */ - int l = packed.Length; - while (i < l) - { - int count = packed[i++]; - int value_Renamed = packed[i++]; - do - result[j++] = value_Renamed; - while (--count > 0); - } - return j; - } - - /// <summary>the input device </summary> - private System.IO.TextReader zzReader; - - /// <summary>the current state of the DFA </summary> - private int zzState; - - /// <summary>the current lexical state </summary> - private int zzLexicalState = YYINITIAL; - - /// <summary>this buffer contains the current text to be matched and is - /// the source of the yytext() string - /// </summary> - private char[] zzBuffer = new char[ZZ_BUFFERSIZE]; - - /// <summary>the textposition at the last accepting state </summary> - private int zzMarkedPos; - - /// <summary>the textposition at the last state to be included in yytext </summary> - private int zzPushbackPos; - - /// <summary>the current text 
position in the buffer </summary> - private int zzCurrentPos; - - /// <summary>startRead marks the beginning of the yytext() string in the buffer </summary> - private int zzStartRead; - - /// <summary>endRead marks the last character in the buffer, that has been read - /// from input - /// </summary> - private int zzEndRead; - - /// <summary>number of newlines encountered up to the start of the matched text </summary> - private int yyline; - - /// <summary>the number of characters up to the start of the matched text </summary> - private int yychar; - - /// <summary> the number of characters from the last newline up to the start of the - /// matched text - /// </summary> - private int yycolumn; - - /// <summary> zzAtBOL == true <=> the scanner is currently at the beginning of a line</summary> - private bool zzAtBOL = true; - - /// <summary>zzAtEOF == true <=> the scanner is at the EOF </summary> - private bool zzAtEOF; - - /* user code: */ - - public static readonly int ALPHANUM; - public static readonly int APOSTROPHE; - public static readonly int ACRONYM; - public static readonly int COMPANY; - public static readonly int EMAIL; - public static readonly int HOST; - public static readonly int NUM; - public static readonly int CJ; - /// <deprecated> this solves a bug where HOSTs that end with '.' are identified - /// as ACRONYMs. - /// </deprecated> - [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs")] - public static readonly int ACRONYM_DEP; - - public static readonly System.String[] TOKEN_TYPES; - - public int Yychar() - { - return yychar; - } - - /* - * Resets the Tokenizer to a new Reader. 
- */ - internal void Reset(System.IO.TextReader r) - { - // reset to default buffer size, if buffer has grown - if (zzBuffer.Length > ZZ_BUFFERSIZE) - { - zzBuffer = new char[ZZ_BUFFERSIZE]; - } - Yyreset(r); - } - - /// <summary> Fills Lucene token with the current token text.</summary> - internal void GetText(Token t) - { - t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); - } - - /// <summary> Fills TermAttribute with the current token text.</summary> - internal void GetText(ITermAttribute t) - { - t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); - } - - - /// <summary> Creates a new scanner - /// There is also a java.io.InputStream version of this constructor. - /// - /// </summary> - /// <param name="in_Renamed"> the java.io.Reader to read input from. - /// </param> - internal StandardTokenizerImpl(System.IO.TextReader in_Renamed) - { - this.zzReader = in_Renamed; - } - - /// <summary> Creates a new scanner. - /// There is also java.io.Reader version of this constructor. - /// - /// </summary> - /// <param name="in_Renamed"> the java.io.Inputstream to read input from. - /// </param> - internal StandardTokenizerImpl(System.IO.Stream in_Renamed):this(new System.IO.StreamReader(in_Renamed, System.Text.Encoding.Default)) - { - } - - /// <summary> Unpacks the compressed character translation table. - /// - /// </summary> - /// <param name="packed"> the packed character translation table - /// </param> - /// <returns> the unpacked character translation table - /// </returns> - private static char[] ZzUnpackCMap(System.String packed) - { - char[] map = new char[0x10000]; - int i = 0; /* index in packed string */ - int j = 0; /* index in unpacked array */ - while (i < 1154) - { - int count = packed[i++]; - char value_Renamed = packed[i++]; - do - map[j++] = value_Renamed; - while (--count > 0); - } - return map; - } - - - /// <summary> Refills the input buffer. - /// </summary> - /// <returns><c>false</c>, iff there was new input. 
- /// - /// </returns> - /// <exception cref="System.IO.IOException"> if any I/O-Error occurs - /// </exception> - private bool ZzRefill() - { - - /* first: make room (if you can) */ - if (zzStartRead > 0) - { - Array.Copy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead - zzStartRead); - - /* translate stored positions */ - zzEndRead -= zzStartRead; - zzCurrentPos -= zzStartRead; - zzMarkedPos -= zzStartRead; - zzPushbackPos -= zzStartRead; - zzStartRead = 0; - } - - /* is the buffer big enough? */ - if (zzCurrentPos >= zzBuffer.Length) - { - /* if not: blow it up */ - char[] newBuffer = new char[zzCurrentPos * 2]; - Array.Copy(zzBuffer, 0, newBuffer, 0, zzBuffer.Length); - zzBuffer = newBuffer; - } - - /* finally: fill the buffer with new input */ - int numRead = zzReader.Read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead); - - if (numRead <= 0) - { - return true; - } - else - { - zzEndRead += numRead; - return false; - } - } - - - /// <summary> Closes the input stream.</summary> - public void Yyclose() - { - zzAtEOF = true; /* indicate end of file */ - zzEndRead = zzStartRead; /* invalidate buffer */ - - if (zzReader != null) - zzReader.Close(); - } - - - /// <summary> Resets the scanner to read from a new input stream. - /// Does not close the old reader. - /// - /// All internal variables are reset, the old input stream - /// <b>cannot</b> be reused (internal buffer is discarded and lost). - /// Lexical state is set to <tt>ZZ_INITIAL</tt>. 
- /// - /// </summary> - /// <param name="reader"> the new input stream - /// </param> - public void Yyreset(System.IO.TextReader reader) - { - zzReader = reader; - zzAtBOL = true; - zzAtEOF = false; - zzEndRead = zzStartRead = 0; - zzCurrentPos = zzMarkedPos = zzPushbackPos = 0; - yyline = yychar = yycolumn = 0; - zzLexicalState = YYINITIAL; - } - - - /// <summary> Returns the current lexical state.</summary> - public int Yystate() - { - return zzLexicalState; - } - - - /// <summary> Enters a new lexical state - /// - /// </summary> - /// <param name="newState">the new lexical state - /// </param> - public void Yybegin(int newState) - { - zzLexicalState = newState; - } - - - /// <summary> Returns the text matched by the current regular expression.</summary> - public System.String Yytext() - { - return new System.String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); - } - - - /// <summary> Returns the character at position <tt>pos</tt> from the - /// matched text. - /// - /// It is equivalent to yytext().charAt(pos), but faster - /// - /// </summary> - /// <param name="pos">the position of the character to fetch. - /// A value from 0 to yylength()-1. - /// - /// </param> - /// <returns> the character at position pos - /// </returns> - public char Yycharat(int pos) - { - return zzBuffer[zzStartRead + pos]; - } - - - /// <summary> Returns the length of the matched text region.</summary> - public int Yylength() - { - return zzMarkedPos - zzStartRead; - } - - - /// <summary> Reports an error that occured while scanning. - /// - /// In a wellformed scanner (no or only correct usage of - /// yypushback(int) and a match-all fallback rule) this method - /// will only be called with things that "Can't Possibly Happen". - /// If this method is called, something is seriously wrong - /// (e.g. a JFlex bug producing a faulty scanner etc.). - /// - /// Usual syntax/scanner level error handling should be done - /// in error fallback rules. 
- /// - /// </summary> - /// <param name="errorCode"> the code of the errormessage to display - /// </param> - private void ZzScanError(int errorCode) - { - System.String message; - try - { - message = ZZ_ERROR_MSG[errorCode]; - } - catch (System.IndexOutOfRangeException) - { - message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; - } - - throw new System.ApplicationException(message); - } - - - /// <summary> Pushes the specified amount of characters back into the input stream. - /// - /// They will be read again by then next call of the scanning method - /// - /// </summary> - /// <param name="number"> the number of characters to be read again. - /// This number must not be greater than yylength()! - /// </param> - public virtual void Yypushback(int number) - { - if (number > Yylength()) - ZzScanError(ZZ_PUSHBACK_2BIG); - - zzMarkedPos -= number; - } - - - /// <summary> Resumes scanning until the next regular expression is matched, - /// the end of input is encountered or an I/O-Error occurs. - /// - /// </summary> - /// <returns> the next token - /// </returns> - /// <exception cref="System.IO.IOException"> if any I/O-Error occurs - /// </exception> - public virtual int GetNextToken() - { - int zzInput; - int zzAction; - - // cached fields: - int zzCurrentPosL; - int zzMarkedPosL; - int zzEndReadL = zzEndRead; - char[] zzBufferL = zzBuffer; - char[] zzCMapL = ZZ_CMAP; - - int[] zzTransL = ZZ_TRANS; - int[] zzRowMapL = ZZ_ROWMAP; - int[] zzAttrL = ZZ_ATTRIBUTE; - - while (true) - { - zzMarkedPosL = zzMarkedPos; - - yychar += zzMarkedPosL - zzStartRead; - - zzAction = - 1; - - zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; - - zzState = zzLexicalState; - - - { - while (true) - { - - if (zzCurrentPosL < zzEndReadL) - zzInput = zzBufferL[zzCurrentPosL++]; - else if (zzAtEOF) - { - zzInput = YYEOF; - goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place - } - else - { - // store back cached positions - zzCurrentPos = zzCurrentPosL; - zzMarkedPos 
= zzMarkedPosL; - bool eof = ZzRefill(); - // get translated positions and possibly new buffer - zzCurrentPosL = zzCurrentPos; - zzMarkedPosL = zzMarkedPos; - zzBufferL = zzBuffer; - zzEndReadL = zzEndRead; - if (eof) - { - zzInput = YYEOF; - goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place - } - else - { - zzInput = zzBufferL[zzCurrentPosL++]; - } - } - int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]]; - if (zzNext == - 1) - { - goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place - } - zzState = zzNext; - - int zzAttributes = zzAttrL[zzState]; - if ((zzAttributes & 1) == 1) - { - zzAction = zzState; - zzMarkedPosL = zzCurrentPosL; - if ((zzAttributes & 8) == 8) - { - goto zzForAction_brk; // {{Aroush-2.9}} this 'goto' maybe in the wrong place - } - } - } - } - -zzForAction_brk: ; // {{Aroush-2.9}} this 'lable' maybe in the wrong place - - - // store back cached position - zzMarkedPos = zzMarkedPosL; - - switch (zzAction < 0?zzAction:ZZ_ACTION[zzAction]) - { - - case 4: - { - return HOST; - } - - case 11: break; - - case 9: - { - return ACRONYM; - } - - case 12: break; - - case 8: - { - return ACRONYM_DEP; - } - - case 13: break; - - case 1: - { - /* ignore */ - } - goto case 14; - - case 14: break; - - case 5: - { - return NUM; - } - - case 15: break; - - case 3: - { - return CJ; - } - - case 16: break; - - case 2: - { - return ALPHANUM; - } - - case 17: break; - - case 7: - { - return COMPANY; - } - - case 18: break; - - case 6: - { - return APOSTROPHE; - } - - case 19: break; - - case 10: - { - return EMAIL; - } - - case 20: break; - - default: - if (zzInput == YYEOF && zzStartRead == zzCurrentPos) - { - zzAtEOF = true; - return YYEOF; - } - else - { - ZzScanError(ZZ_NO_MATCH); - } - break; - - } - } - } - static StandardTokenizerImpl() - { - ALPHANUM = StandardTokenizer.ALPHANUM; - APOSTROPHE = StandardTokenizer.APOSTROPHE; - ACRONYM = StandardTokenizer.ACRONYM; - COMPANY = 
StandardTokenizer.COMPANY; - EMAIL = StandardTokenizer.EMAIL; - HOST = StandardTokenizer.HOST; - NUM = StandardTokenizer.NUM; - CJ = StandardTokenizer.CJ; - ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; - TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; - } - } -} \ No newline at end of file
