http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab69b431/src/Lucene.Net.Analysis.Common/Analysis/Standard/Std40/UAX29URLEmailTokenizerImpl40.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/Std40/UAX29URLEmailTokenizerImpl40.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/Std40/UAX29URLEmailTokenizerImpl40.cs index e3d58e5..3593baa 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/Std40/UAX29URLEmailTokenizerImpl40.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/Std40/UAX29URLEmailTokenizerImpl40.cs @@ -22,32 +22,32 @@ namespace Lucene.Net.Analysis.Standard.Std40 */ /// <summary> - /// This class implements UAX29URLEmailTokenizer using Unicode 6.1.0. + /// This class implements <see cref="UAX29URLEmailTokenizer"/> using Unicode 6.1.0. /// @deprecated This class is only for exact backwards compatibility /// </summary> [Obsolete("This class is only for exact backwards compatibility")] public sealed class UAX29URLEmailTokenizerImpl40 : IStandardTokenizerInterface { - /** This character denotes the end of file */ + /// <summary>This character denotes the end of file</summary> public static readonly int YYEOF = -1; - /** initial size of the lookahead buffer */ + /// <summary>initial size of the lookahead buffer</summary> private static readonly int ZZ_BUFFERSIZE = 4096; - /** lexical states */ + /// <summary>lexical states</summary> public const int YYINITIAL = 0; - /** - * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l - * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l - * at the beginning of a line - * l is of the form l = 2*k, k a non negative integer - */ + /// <summary> + /// ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + /// ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + /// at the beginning of a line + /// l is of the form l = 2*k, k a non negative integer + /// </summary> 
private static readonly int[] ZZ_LEXSTATE = { 0, 0 }; - /** - * Translates characters to character classes - */ + /// <summary> + /// Translates characters to character classes + /// </summary> private const string ZZ_CMAP_PACKED = "\x0001\x00C1\x0008\x00BF\x0002\x00C1\x0002\x00BF\x0001\x00C1\x0013\x00BF\x0001\x00C2\x0001\x00BE\x0001\x00B9\x0001\x00C2" + "\x0001\x00B2\x0001\x00B0\x0001\x00B5\x0002\x00B3\x0002\x00C2\x0001\x00B4\x0001\x00A4\x0001\x0089\x0001\x00B8\x0001\x00A5" + @@ -201,14 +201,14 @@ namespace Lucene.Net.Analysis.Standard.Std40 "\x000B\x0000\x0038\x007F\x0002\x007D\x001F\x0088\x0003\x0000\x0006\x0088\x0002\x0000\x0006\x0088\x0002\x0000\x0006\x0088" + "\x0002\x0000\x0003\x0088\x001C\x0000\x0003\x007D\x0004\x0000"; - /** - * Translates characters to character classes - */ + /// <summary> + /// Translates characters to character classes + /// </summary> private static readonly char[] ZZ_CMAP = ZzUnpackCMap(ZZ_CMAP_PACKED); - /** - * Translates DFA states to action switch labels. - */ + /// <summary> + /// Translates DFA states to action switch labels. 
+ /// </summary> private static readonly int[] ZZ_ACTION = ZzUnpackAction(); private const string ZZ_ACTION_PACKED_0 = @@ -255,9 +255,9 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Translates a state to a row index in the transition table - */ + /// <summary> + /// Translates a state to a row index in the transition table + /// </summary> private static readonly int[] ZZ_ROWMAP = ZzUnpackRowMap(); private const string ZZ_ROWMAP_PACKED_0 = @@ -502,9 +502,9 @@ namespace Lucene.Net.Analysis.Standard.Std40 return j; } - /** - * The transition table of the DFA - */ + /// <summary> + /// The transition table of the DFA + /// </summary> private static readonly int[] ZZ_TRANS = ZzUnpackTrans(); private const string ZZ_TRANS_PACKED_0 = @@ -3906,9 +3906,9 @@ namespace Lucene.Net.Analysis.Standard.Std40 "Error: pushback value was too large" }; - /** - * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code> - */ + /// <summary> + /// ZZ_ATTRIBUTE[aState] contains the attributes of state <c>aState</c> + /// </summary> private static readonly int[] ZZ_ATTRIBUTE = ZzUnpackAttribute(); private const string ZZ_ATTRIBUTE_PACKED_0 = @@ -3951,73 +3951,77 @@ namespace Lucene.Net.Analysis.Standard.Std40 return j; } - /** the input device */ + /// <summary>the input device</summary> private TextReader zzReader; - /** the current state of the DFA */ + /// <summary>the current state of the DFA</summary> private int zzState; - /** the current lexical state */ + /// <summary>the current lexical state</summary> private int zzLexicalState = YYINITIAL; - /** this buffer contains the current text to be matched and is - the source of the YyText() string */ + /// <summary> + /// this buffer contains the current text to be matched and is + /// the source of the YyText string + /// </summary> private char[] zzBuffer = new char[ZZ_BUFFERSIZE]; - /** the textposition at the last accepting state */ + /// <summary>the textposition at the last accepting state</summary> 
private int zzMarkedPos; - /** the current text position in the buffer */ + /// <summary>the current text position in the buffer</summary> private int zzCurrentPos; - /** startRead marks the beginning of the YyText() string in the buffer */ + /// <summary>startRead marks the beginning of the YyText string in the buffer</summary> private int zzStartRead; - /** endRead marks the last character in the buffer, that has been read - from input */ + /// <summary> + /// endRead marks the last character in the buffer, that has been read + /// from input + /// </summary> private int zzEndRead; - /** number of newlines encountered up to the start of the matched text */ + /// <summary>number of newlines encountered up to the start of the matched text</summary> private int yyline; - /** the number of characters up to the start of the matched text */ + /// <summary>the number of characters up to the start of the matched text</summary> private int yychar; #pragma warning disable 169, 414 - /** - * the number of characters from the last newline up to the start of the - * matched text - */ + /// <summary> + /// the number of characters from the last newline up to the start of the + /// matched text + /// </summary> private int yycolumn; - /** - * zzAtBOL == true <=> the scanner is currently at the beginning of a line - */ + /// <summary> + /// zzAtBOL == true <=> the scanner is currently at the beginning of a line + /// </summary> private bool zzAtBOL = true; - /** zzAtEOF == true <=> the scanner is at the EOF */ + /// <summary>zzAtEOF == true <=> the scanner is at the EOF</summary> private bool zzAtEOF; - /** denotes if the user-EOF-code has already been executed */ + /// <summary>denotes if the user-EOF-code has already been executed</summary> private bool zzEOFDone; #pragma warning restore 169, 414 /* user code: */ - /** Alphanumeric sequences */ + /// <summary>Alphanumeric sequences</summary> public static readonly int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM; - /** Numbers 
*/ + /// <summary>Numbers</summary> public static readonly int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM; - /** - * Chars in class \p{Line_Break = Complex_Context} are from South East Asian - * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept - * together as as a single token rather than broken up, because the logic - * required to break them at word boundaries is too complex for UAX#29. - * <p> - * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA - */ + /// <summary> + /// Chars in class \p{Line_Break = Complex_Context} are from South East Asian + /// scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept + /// together as as a single token rather than broken up, because the logic + /// required to break them at word boundaries is too complex for UAX#29. + /// <para/> + /// See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA + /// </summary> public static readonly int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN; public static readonly int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC; @@ -4037,32 +4041,30 @@ namespace Lucene.Net.Analysis.Standard.Std40 get { return yychar; } } - /** - * Fills CharTermAttribute with the current token text. - */ + /// <summary> + /// Fills ICharTermAttribute with the current token text. + /// </summary> public void GetText(ICharTermAttribute t) { t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); } - /** - * Creates a new scanner - * - * @param in the TextReader to read input from. - */ + /// <summary> + /// Creates a new scanner + /// </summary> + /// <param name="in">the TextReader to read input from.</param> public UAX29URLEmailTokenizerImpl40(TextReader @in) { this.zzReader = @in; } - /** - * Unpacks the compressed character translation table. 
- * - * @param packed the packed character translation table - * @return the unpacked character translation table - */ + /// <summary> + /// Unpacks the compressed character translation table. + /// </summary> + /// <param name="packed">the packed character translation table</param> + /// <returns>the unpacked character translation table</returns> private static char[] ZzUnpackCMap(string packed) { char[] map = new char[0x10000]; @@ -4078,13 +4080,11 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Refills the input buffer. - * - * @return <code>false</code>, iff there was new input. - * - * @exception java.io.IOException if any I/O-Error occurs - */ + /// <summary> + /// Refills the input buffer. + /// </summary> + /// <returns><c>false</c>, iff there was new input.</returns> + /// <exception cref="IOException">if any I/O-Error occurs</exception> private bool ZzRefill() { @@ -4140,9 +4140,9 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Closes the input stream. - */ + /// <summary> + /// Disposes the input stream. + /// </summary> public void YyClose() { zzAtEOF = true; /* indicate end of file */ @@ -4155,18 +4155,17 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Resets the scanner to read from a new input stream. - * Does not close the old reader. - * - * All internal variables are reset, the old input stream - * <b>cannot</b> be reused (internal buffer is discarded and lost). - * Lexical state is set to <tt>ZZ_INITIAL</tt>. - * - * Internal scan buffer is resized down to its initial length, if it has grown. - * - * @param reader the new input stream - */ + /// <summary> + /// Resets the scanner to read from a new input stream. + /// Does not close the old reader. + /// <para/> + /// All internal variables are reset, the old input stream + /// <b>cannot</b> be reused (internal buffer is discarded and lost). + /// Lexical state is set to <see cref="YYINITIAL"/>. 
+ /// <para/> + /// Internal scan buffer is resized down to its initial length, if it has grown. + /// </summary> + /// <param name="reader">the new input stream </param> public void YyReset(TextReader reader) { zzReader = reader; @@ -4182,75 +4181,73 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Returns the current lexical state. - */ + /// <summary> + /// Returns the current lexical state. + /// </summary> public int YyState { get { return zzLexicalState; } } - /** - * Enters a new lexical state - * - * @param newState the new lexical state - */ + /// <summary> + /// Enters a new lexical state + /// </summary> + /// <param name="newState">the new lexical state</param> public void YyBegin(int newState) { zzLexicalState = newState; } - /** - * Returns the text matched by the current regular expression. - */ + /// <summary> + /// Returns the text matched by the current regular expression. + /// </summary> public string YyText { get { return new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); } } - /** - * Returns the character at position <tt>pos</tt> from the - * matched text. - * - * It is equivalent to YyText().charAt(pos), but faster - * - * @param pos the position of the character to fetch. - * A value from 0 to YyLength()-1. - * - * @return the character at position pos - */ + /// <summary> + /// Returns the character at position <paramref name="pos"/> from the + /// matched text. + /// <para/> + /// It is equivalent to YyText[pos], but faster + /// </summary> + /// <param name="pos"> + /// the position of the character to fetch. + /// A value from 0 to YyLength-1. + /// </param> + /// <returns>the character at position pos</returns> public char YyCharAt(int pos) { return zzBuffer[zzStartRead + pos]; } - /** - * Returns the length of the matched text region. - */ + /// <summary> + /// Returns the length of the matched text region. 
+ /// </summary> public int YyLength { get { return zzMarkedPos - zzStartRead; } } - /** - * Reports an error that occured while scanning. - * - * In a wellformed scanner (no or only correct usage of - * YyPushBack(int) and a match-all fallback rule) this method - * will only be called with things that "Can't Possibly Happen". - * If this method is called, something is seriously wrong - * (e.g. a JFlex bug producing a faulty scanner etc.). - * - * Usual syntax/scanner level error handling should be done - * in error fallback rules. - * - * @param errorCode the code of the errormessage to display - */ + /// <summary> + /// Reports an error that occured while scanning. + /// <para/> + /// In a wellformed scanner (no or only correct usage of + /// YyPushBack(int) and a match-all fallback rule) this method + /// will only be called with things that "Can't Possibly Happen". + /// If this method is called, something is seriously wrong + /// (e.g. a JFlex bug producing a faulty scanner etc.). + /// <para/> + /// Usual syntax/scanner level error handling should be done + /// in error fallback rules. + /// </summary> + /// <param name="errorCode">the code of the errormessage to display</param> private void ZzScanError(int errorCode) { string message; @@ -4267,14 +4264,15 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Pushes the specified amount of characters back into the input stream. - * - * They will be read again by then next call of the scanning method - * - * @param number the number of characters to be read again. - * This number must not be greater than YyLength()! - */ + /// <summary> + /// Pushes the specified amount of characters back into the input stream. + /// <para/> + /// They will be read again by then next call of the scanning method + /// </summary> + /// <param name="number"> + /// the number of characters to be read again. + /// This number must not be greater than YyLength! 
+ /// </param> public void YyPushBack(int number) { if (number > YyLength) @@ -4284,13 +4282,12 @@ namespace Lucene.Net.Analysis.Standard.Std40 } - /** - * Resumes scanning until the next regular expression is matched, - * the end of input is encountered or an I/O-Error occurs. - * - * @return the next token - * @exception java.io.IOException if any I/O-Error occurs - */ + /// <summary> + /// Resumes scanning until the next regular expression is matched, + /// the end of input is encountered or an I/O-Error occurs. + /// </summary> + /// <returns>the next token</returns> + /// <exception cref="IOException">if any I/O-Error occurs</exception> public int GetNextToken() { int zzInput;
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab69b431/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs index 502b98c..65aecc2 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs @@ -23,21 +23,19 @@ namespace Lucene.Net.Analysis.Standard */ /// <summary> - /// Filters <see cref="org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer"/> + /// Filters <see cref="UAX29URLEmailTokenizer"/> /// with <see cref="StandardFilter"/>, /// <see cref="LowerCaseFilter"/> and /// <see cref="StopFilter"/>, using a list of /// English stop words. /// - /// <a name="version"/> /// <para> - /// You must specify the required <see cref="org.apache.lucene.util.Version"/> - /// compatibility when creating UAX29URLEmailAnalyzer + /// You must specify the required <see cref="LuceneVersion"/> + /// compatibility when creating <see cref="UAX29URLEmailAnalyzer"/> /// </para> /// </summary> public sealed class UAX29URLEmailAnalyzer : StopwordAnalyzerBase { - /// <summary> /// Default maximum allowed token length </summary> public const int DEFAULT_MAX_TOKEN_LENGTH = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; @@ -52,8 +50,7 @@ namespace Lucene.Net.Analysis.Standard /// <summary> /// Builds an analyzer with the given stop words. 
</summary> - /// <param name="matchVersion"> Lucene version to match See {@link - /// <a href="#version">above</a>} </param> + /// <param name="matchVersion"> Lucene version to match - See <see cref="UAX29URLEmailAnalyzer"/> </param> /// <param name="stopWords"> stop words </param> public UAX29URLEmailAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords) @@ -61,10 +58,9 @@ namespace Lucene.Net.Analysis.Standard } /// <summary> - /// Builds an analyzer with the default stop words ({@link - /// #STOP_WORDS_SET}). </summary> - /// <param name="matchVersion"> Lucene version to match See {@link - /// <a href="#version">above</a>} </param> + /// Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>). + /// </summary> + /// <param name="matchVersion"> Lucene version to match - See <see cref="UAX29URLEmailAnalyzer"/> </param> public UAX29URLEmailAnalyzer(LuceneVersion matchVersion) : this(matchVersion, STOP_WORDS_SET) { @@ -72,10 +68,9 @@ namespace Lucene.Net.Analysis.Standard /// <summary> /// Builds an analyzer with the stop words from the given reader. 
</summary> - /// <seealso cref= org.apache.lucene.analysis.util.WordlistLoader#getWordSet(java.io.Reader, org.apache.lucene.util.Version) </seealso> - /// <param name="matchVersion"> Lucene version to match See {@link - /// <a href="#version">above</a>} </param> - /// <param name="stopwords"> TextReader to read stop words from </param> + /// <seealso cref="WordlistLoader.GetWordSet(TextReader, LuceneVersion)"/> + /// <param name="matchVersion"> Lucene version to match - See <see cref="UAX29URLEmailAnalyzer"/> </param> + /// <param name="stopwords"> <see cref="TextReader"/> to read stop words from </param> public UAX29URLEmailAnalyzer(LuceneVersion matchVersion, TextReader stopwords) : this(matchVersion, LoadStopwordSet(stopwords, matchVersion)) { @@ -93,7 +88,6 @@ namespace Lucene.Net.Analysis.Standard get { return maxTokenLength; } } - protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(m_matchVersion, reader); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab69b431/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizer.cs index 2c91236..83659e2 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizer.cs @@ -1,11 +1,10 @@ -using Lucene.Net.Analysis.Standard; -using Lucene.Net.Analysis.TokenAttributes; -using Lucene.Net.Util; -using System.IO; -using Lucene.Net.Analysis.Standard.Std31; +using Lucene.Net.Analysis.Standard.Std31; using Lucene.Net.Analysis.Standard.Std34; using Lucene.Net.Analysis.Standard.Std36; using Lucene.Net.Analysis.Standard.Std40; +using Lucene.Net.Analysis.TokenAttributes; +using 
Lucene.Net.Util; +using System.IO; namespace Lucene.Net.Analysis.Standard { @@ -31,26 +30,25 @@ namespace Lucene.Net.Analysis.Standard /// algorithm, as specified in ` /// <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> /// URLs and email addresses are also tokenized according to the relevant RFCs. - /// <p/> + /// <para/> /// Tokens produced are of the following types: - /// <ul> - /// <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li> - /// <li><NUM>: A number</li> - /// <li><URL>: A URL</li> - /// <li><EMAIL>: An email address</li> - /// <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast - /// Asian languages, including Thai, Lao, Myanmar, and Khmer</li> - /// <li><IDEOGRAPHIC>: A single CJKV ideographic character</li> - /// <li><HIRAGANA>: A single hiragana character</li> - /// </ul> - /// <a name="version"/> + /// <list type="bullet"> + /// <item><ALPHANUM>: A sequence of alphabetic and numeric characters</item> + /// <item><NUM>: A number</item> + /// <item><URL>: A URL</item> + /// <item><EMAIL>: An email address</item> + /// <item><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast + /// Asian languages, including Thai, Lao, Myanmar, and Khmer</item> + /// <item><IDEOGRAPHIC>: A single CJKV ideographic character</item> + /// <item><HIRAGANA>: A single hiragana character</item> + /// </list> /// <para>You must specify the required <see cref="LuceneVersion"/> - /// compatibility when creating UAX29URLEmailTokenizer: - /// <ul> - /// <li> As of 3.4, Hiragana and Han characters are no longer wrongly split - /// from their combining characters. If you use a previous version number, - /// you get the exact broken behavior for backwards compatibility. - /// </ul> + /// compatibility when creating <see cref="UAX29URLEmailTokenizer"/>: + /// <list type="bullet"> + /// <item> As of 3.4, Hiragana and Han characters are no longer wrongly split + /// from their combining characters. 
If you use a previous version number, + /// you get the exact broken behavior for backwards compatibility.</item> + /// </list> /// </para> /// </summary> public sealed class UAX29URLEmailTokenizer : Tokenizer @@ -71,7 +69,17 @@ namespace Lucene.Net.Analysis.Standard /// <summary> /// String token types that correspond to token type int constants </summary> - public static readonly string[] TOKEN_TYPES = new string[] { StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL], "<URL>", "<EMAIL>" }; + public static readonly string[] TOKEN_TYPES = new string[] { + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM], + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM], + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN], + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC], + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA], + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA], + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL], + "<URL>", + "<EMAIL>" + }; private int skippedPositions; @@ -99,9 +107,10 @@ namespace Lucene.Net.Analysis.Standard /// <summary> - /// Creates a new instance of the UAX29URLEmailTokenizer. Attaches - /// the <code>input</code> to the newly created JFlex scanner. + /// Creates a new instance of the <see cref="UAX29URLEmailTokenizer"/>. Attaches + /// the <paramref name="input"/> to the newly created JFlex scanner. 
/// </summary> + /// <param name="matchVersion"> Lucene compatibility version </param> /// <param name="input"> The input reader </param> public UAX29URLEmailTokenizer(LuceneVersion matchVersion, TextReader input) : base(input) @@ -110,7 +119,7 @@ namespace Lucene.Net.Analysis.Standard } /// <summary> - /// Creates a new UAX29URLEmailTokenizer with a given <see cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> + /// Creates a new <see cref="UAX29URLEmailTokenizer"/> with a given <see cref="AttributeSource.AttributeFactory"/> /// </summary> public UAX29URLEmailTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input) : base(factory, input) @@ -119,7 +128,7 @@ namespace Lucene.Net.Analysis.Standard } /// <summary> - /// LUCENENET: This method was added in .NET to prevent having to repeat code in the constructors. + /// LUCENENET specific: This method was added in .NET to prevent having to repeat code in the constructors. /// </summary> /// <param name="matchVersion"></param> private void Init(LuceneVersion matchVersion) @@ -165,7 +174,7 @@ namespace Lucene.Net.Analysis.Standard private IPositionIncrementAttribute posIncrAtt; private ITypeAttribute typeAtt; - public override bool IncrementToken() + public override sealed bool IncrementToken() { ClearAttributes(); skippedPositions = 0; @@ -197,7 +206,7 @@ namespace Lucene.Net.Analysis.Standard } } - public override void End() + public override sealed void End() { base.End(); // set final offset http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab69b431/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerFactory.cs index dc902f8..976f4c5 100644 --- 
a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerFactory.cs @@ -1,8 +1,7 @@ -using Lucene.Net.Util; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; using System.Collections.Generic; using System.IO; -using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory; -using System; namespace Lucene.Net.Analysis.Standard { @@ -37,7 +36,7 @@ namespace Lucene.Net.Analysis.Standard private readonly int maxTokenLength; /// <summary> - /// Creates a new UAX29URLEmailTokenizerFactory </summary> + /// Creates a new <see cref="UAX29URLEmailTokenizerFactory"/> </summary> public UAX29URLEmailTokenizerFactory(IDictionary<string, string> args) : base(args) { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab69b431/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerImpl.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerImpl.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerImpl.cs index b45186e..dbf05a7 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerImpl.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailTokenizerImpl.cs @@ -26,44 +26,44 @@ namespace Lucene.Net.Analysis.Standard /// algorithm, as specified in /// <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> /// URLs and email addresses are also tokenized according to the relevant RFCs. 
- /// <p/> + /// <para/> /// Tokens produced are of the following types: - /// <ul> - /// <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li> - /// <li><NUM>: A number</li> - /// <li><URL>: A URL</li> - /// <li><EMAIL>: An email address</li> - /// <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast - /// Asian languages, including Thai, Lao, Myanmar, and Khmer</li> - /// <li><IDEOGRAPHIC>: A single CJKV ideographic character</li> - /// <li><HIRAGANA>: A single hiragana character</li> - /// <li><KATAKANA>: A sequence of katakana characters</li> - /// <li><HANGUL>: A sequence of Hangul characters</li> - /// </ul> + /// <list type="bullet"> + /// <item><ALPHANUM>: A sequence of alphabetic and numeric characters</item> + /// <item><NUM>: A number</item> + /// <item><URL>: A URL</item> + /// <item><EMAIL>: An email address</item> + /// <item><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast + /// Asian languages, including Thai, Lao, Myanmar, and Khmer</item> + /// <item><IDEOGRAPHIC>: A single CJKV ideographic character</item> + /// <item><HIRAGANA>: A single hiragana character</item> + /// <item><KATAKANA>: A sequence of katakana characters</item> + /// <item><HANGUL>: A sequence of Hangul characters</item> + /// </list> /// </summary> public sealed class UAX29URLEmailTokenizerImpl : IStandardTokenizerInterface { - /** This character denotes the end of file */ + /// <summary>This character denotes the end of file</summary> public static readonly int YYEOF = -1; - /** initial size of the lookahead buffer */ + /// <summary>initial size of the lookahead buffer</summary> private static readonly int ZZ_BUFFERSIZE = 4096; - /** lexical states */ + /// <summary>lexical states</summary> public const int YYINITIAL = 0; public const int AVOID_BAD_URL = 2; - /** - * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l - * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l - * at the beginning of a 
line - * l is of the form l = 2*k, k a non negative integer - */ + /// <summary> + /// ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + /// ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + /// at the beginning of a line + /// l is of the form l = 2*k, k a non negative integer + /// </summary> private static readonly int[] ZZ_LEXSTATE = { 0, 0, 1, 1 }; - /** - * Translates characters to character classes - */ + /// <summary> + /// Translates characters to character classes + /// </summary> private const string ZZ_CMAP_PACKED = "\x0001\x00C6\x0008\x00C4\x0002\x00C6\x0002\x00C4\x0001\x00C6\x0013\x00C4\x0001\x00C7\x0001\x008D\x0001\x00BF\x0001\x00C7" + "\x0001\x00B9\x0001\x00B7\x0001\x008C\x0002\x00BA\x0002\x00C7\x0001\x00BB\x0001\x00AB\x0001\x0090\x0001\x00BE\x0001\x00AD" + @@ -219,14 +219,14 @@ namespace Lucene.Net.Analysis.Standard "\x0002\x0000\x0003\x008F\x001C\x0000\x0003\x007F\x0004\x0000"; - /** - * Translates characters to character classes - */ + /// <summary> + /// Translates characters to character classes + /// </summary> private static readonly char[] ZZ_CMAP = ZzUnpackCMap(ZZ_CMAP_PACKED); - /** - * Translates DFA states to action switch labels. - */ + /// <summary> + /// Translates DFA states to action switch labels. 
+ /// </summary> private static readonly int[] ZZ_ACTION = ZzUnpackAction(); private const string ZZ_ACTION_PACKED_0 = @@ -292,9 +292,9 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Translates a state to a row index in the transition table - */ + /// <summary> + /// Translates a state to a row index in the transition table + /// </summary> private static readonly int[] ZZ_ROWMAP = ZzUnpackRowMap(); private const string ZZ_ROWMAP_PACKED_0 = @@ -710,9 +710,9 @@ namespace Lucene.Net.Analysis.Standard return j; } - /** - * The transition table of the DFA - */ + /// <summary> + /// The transition table of the DFA + /// </summary> private static readonly int[] ZZ_TRANS = ZzUnpackTrans(); private const string ZZ_TRANS_PACKED_0 = @@ -8998,9 +8998,9 @@ namespace Lucene.Net.Analysis.Standard "Error: pushback value was too large" }; - /** - * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code> - */ + /// <summary> + /// ZZ_ATTRIBUTE[aState] contains the attributes of state <c>aState</c> + /// </summary> private static readonly int[] ZZ_ATTRIBUTE = ZzUnpackAttribute(); private const string ZZ_ATTRIBUTE_PACKED_0 = @@ -9056,73 +9056,77 @@ namespace Lucene.Net.Analysis.Standard return j; } - /** the input device */ + /// <summary>the input device</summary> private TextReader zzReader; - /** the current state of the DFA */ + /// <summary>the current state of the DFA</summary> private int zzState; - /** the current lexical state */ + /// <summary>the current lexical state</summary> private int zzLexicalState = YYINITIAL; - /** this buffer contains the current text to be matched and is - the source of the YyText() string */ + /// <summary> + /// this buffer contains the current text to be matched and is + /// the source of the YyText string + /// </summary> private char[] zzBuffer = new char[ZZ_BUFFERSIZE]; - /** the textposition at the last accepting state */ + /// <summary>the textposition at the last accepting state</summary> private int zzMarkedPos; 
- /** the current text position in the buffer */ + /// <summary>the current text position in the buffer</summary> private int zzCurrentPos; - /** startRead marks the beginning of the YyText() string in the buffer */ + /// <summary>startRead marks the beginning of the YyText string in the buffer</summary> private int zzStartRead; - /** endRead marks the last character in the buffer, that has been read - from input */ + /// <summary> + /// endRead marks the last character in the buffer, that has been read + /// from input + /// </summary> private int zzEndRead; - /** number of newlines encountered up to the start of the matched text */ + /// <summary>number of newlines encountered up to the start of the matched text</summary> private int yyline; - /** the number of characters up to the start of the matched text */ + /// <summary>the number of characters up to the start of the matched text</summary> private int yychar; #pragma warning disable 169, 414 - /** - * the number of characters from the last newline up to the start of the - * matched text - */ + /// <summary> + /// the number of characters from the last newline up to the start of the + /// matched text + /// </summary> private int yycolumn; - /** - * zzAtBOL == true <=> the scanner is currently at the beginning of a line - */ + /// <summary> + /// zzAtBOL == true <=> the scanner is currently at the beginning of a line + /// </summary> private bool zzAtBOL = true; - /** zzAtEOF == true <=> the scanner is at the EOF */ + /// <summary>zzAtEOF == true <=> the scanner is at the EOF</summary> private bool zzAtEOF; - /** denotes if the user-EOF-code has already been executed */ + /// <summary>denotes if the user-EOF-code has already been executed</summary> private bool zzEOFDone; #pragma warning restore 169, 414 /* user code: */ - /** Alphanumeric sequences */ + /// <summary>Alphanumeric sequences</summary> public static readonly int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM; - /** Numbers */ + /// 
<summary>Numbers</summary> public static readonly int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM; - /** - * Chars in class \p{Line_Break = Complex_Context} are from South East Asian - * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept - * together as as a single token rather than broken up, because the logic - * required to break them at word boundaries is too complex for UAX#29. - * <p> - * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA - */ + /// <summary> + /// Chars in class \p{Line_Break = Complex_Context} are from South East Asian + /// scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept + /// together as as a single token rather than broken up, because the logic + /// required to break them at word boundaries is too complex for UAX#29. + /// <para/> + /// See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA + /// </summary> public static readonly int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN; public static readonly int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC; @@ -9142,32 +9146,30 @@ namespace Lucene.Net.Analysis.Standard get { return yychar; } } - /** - * Fills CharTermAttribute with the current token text. - */ + /// <summary> + /// Fills ICharTermAttribute with the current token text. + /// </summary> public void GetText(ICharTermAttribute t) { t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); } - /** - * Creates a new scanner - * - * @param in the TextReader to read input from. - */ + /// <summary> + /// Creates a new scanner + /// </summary> + /// <param name="in">the TextReader to read input from.</param> public UAX29URLEmailTokenizerImpl(TextReader @in) { this.zzReader = @in; } - /** - * Unpacks the compressed character translation table. 
- * - * @param packed the packed character translation table - * @return the unpacked character translation table - */ + /// <summary> + /// Unpacks the compressed character translation table. + /// </summary> + /// <param name="packed">the packed character translation table</param> + /// <returns>the unpacked character translation table</returns> private static char[] ZzUnpackCMap(string packed) { char[] map = new char[0x10000]; @@ -9183,13 +9185,11 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Refills the input buffer. - * - * @return <code>false</code>, iff there was new input. - * - * @exception java.io.IOException if any I/O-Error occurs - */ + /// <summary> + /// Refills the input buffer. + /// </summary> + /// <returns><c>false</c>, iff there was new input.</returns> + /// <exception cref="IOException">if any I/O-Error occurs</exception> private bool ZzRefill() { @@ -9245,9 +9245,9 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Closes the input stream. - */ + /// <summary> + /// Disposes the input stream. + /// </summary> public void YyClose() { zzAtEOF = true; /* indicate end of file */ @@ -9260,18 +9260,17 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Resets the scanner to read from a new input stream. - * Does not close the old reader. - * - * All internal variables are reset, the old input stream - * <b>cannot</b> be reused (internal buffer is discarded and lost). - * Lexical state is set to <tt>ZZ_INITIAL</tt>. - * - * Internal scan buffer is resized down to its initial length, if it has grown. - * - * @param reader the new input stream - */ + /// <summary> + /// Resets the scanner to read from a new input stream. + /// Does not close the old reader. + /// <para/> + /// All internal variables are reset, the old input stream + /// <b>cannot</b> be reused (internal buffer is discarded and lost). + /// Lexical state is set to <see cref="YYINITIAL"/>. 
+ /// <para/> + /// Internal scan buffer is resized down to its initial length, if it has grown. + /// </summary> + /// <param name="reader">the new input stream </param> public void YyReset(TextReader reader) { zzReader = reader; @@ -9287,75 +9286,73 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Returns the current lexical state. - */ + /// <summary> + /// Returns the current lexical state. + /// </summary> public int YyState { get { return zzLexicalState; } } - /** - * Enters a new lexical state - * - * @param newState the new lexical state - */ + /// <summary> + /// Enters a new lexical state + /// </summary> + /// <param name="newState">the new lexical state</param> public void YyBegin(int newState) { zzLexicalState = newState; } - /** - * Returns the text matched by the current regular expression. - */ + /// <summary> + /// Returns the text matched by the current regular expression. + /// </summary> public string YyText { get { return new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); } } - /** - * Returns the character at position <tt>pos</tt> from the - * matched text. - * - * It is equivalent to YyText().charAt(pos), but faster - * - * @param pos the position of the character to fetch. - * A value from 0 to YyLength()-1. - * - * @return the character at position pos - */ + /// <summary> + /// Returns the character at position <paramref name="pos"/> from the + /// matched text. + /// <para/> + /// It is equivalent to YyText[pos], but faster + /// </summary> + /// <param name="pos"> + /// the position of the character to fetch. + /// A value from 0 to YyLength-1. + /// </param> + /// <returns>the character at position pos</returns> public char YyCharAt(int pos) { return zzBuffer[zzStartRead + pos]; } - /** - * Returns the length of the matched text region. - */ + /// <summary> + /// Returns the length of the matched text region. 
+ /// </summary> public int YyLength { get { return zzMarkedPos - zzStartRead; } } - /** - * Reports an error that occured while scanning. - * - * In a wellformed scanner (no or only correct usage of - * YyPushBack(int) and a match-all fallback rule) this method - * will only be called with things that "Can't Possibly Happen". - * If this method is called, something is seriously wrong - * (e.g. a JFlex bug producing a faulty scanner etc.). - * - * Usual syntax/scanner level error handling should be done - * in error fallback rules. - * - * @param errorCode the code of the errormessage to display - */ + /// <summary> + /// Reports an error that occured while scanning. + /// <para/> + /// In a wellformed scanner (no or only correct usage of + /// YyPushBack(int) and a match-all fallback rule) this method + /// will only be called with things that "Can't Possibly Happen". + /// If this method is called, something is seriously wrong + /// (e.g. a JFlex bug producing a faulty scanner etc.). + /// <para/> + /// Usual syntax/scanner level error handling should be done + /// in error fallback rules. + /// </summary> + /// <param name="errorCode">the code of the errormessage to display</param> private void ZzScanError(int errorCode) { string message; @@ -9372,14 +9369,15 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Pushes the specified amount of characters back into the input stream. - * - * They will be read again by then next call of the scanning method - * - * @param number the number of characters to be read again. - * This number must not be greater than YyLength()! - */ + /// <summary> + /// Pushes the specified amount of characters back into the input stream. + /// <para/> + /// They will be read again by then next call of the scanning method + /// </summary> + /// <param name="number"> + /// the number of characters to be read again. + /// This number must not be greater than YyLength! 
+ /// </param> public void YyPushBack(int number) { if (number > YyLength) @@ -9389,13 +9387,12 @@ namespace Lucene.Net.Analysis.Standard } - /** - * Resumes scanning until the next regular expression is matched, - * the end of input is encountered or an I/O-Error occurs. - * - * @return the next token - * @exception java.io.IOException if any I/O-Error occurs - */ + /// <summary> + /// Resumes scanning until the next regular expression is matched, + /// the end of input is encountered or an I/O-Error occurs. + /// </summary> + /// <returns>the next token</returns> + /// <exception cref="IOException">if any I/O-Error occurs</exception> public int GetNextToken() { int zzInput;
