Lucene.Net.Analysis.Cjk refactor: member accessibility and documentation comments
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/917b4fdf Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/917b4fdf Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/917b4fdf Branch: refs/heads/api-work Commit: 917b4fdf53f978f32219cef6edf31f3c30b84dea Parents: 7fdbd66 Author: Shad Storhaug <[email protected]> Authored: Thu Feb 2 21:53:51 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Thu Feb 2 21:53:51 2017 +0700 ---------------------------------------------------------------------- .../Analysis/Cjk/CJKAnalyzer.cs | 12 ++--- .../Analysis/Cjk/CJKBigramFilter.cs | 32 ++++++------- .../Analysis/Cjk/CJKBigramFilterFactory.cs | 13 +++--- .../Analysis/Cjk/CJKTokenizer.cs | 14 +++--- .../Analysis/Cjk/CJKTokenizerFactory.cs | 13 +++--- .../Analysis/Cjk/CJKWidthFilter.cs | 49 ++++++++++++++------ .../Analysis/Cjk/CJKWidthFilterFactory.cs | 10 ++-- 7 files changed, 82 insertions(+), 61 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs index 0fcc42c..28c7a52 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs @@ -25,16 +25,16 @@ namespace Lucene.Net.Analysis.Cjk */ /// <summary> - /// An <seealso cref="Analyzer"/> that tokenizes text with <seealso cref="StandardTokenizer"/>, - /// normalizes content with <seealso cref="CJKWidthFilter"/>, folds case with - /// <seealso cref="LowerCaseFilter"/>, forms bigrams of CJK with <seealso cref="CJKBigramFilter"/>, - /// and filters stopwords with <seealso cref="StopFilter"/> + /// An <see cref="Analyzer"/> that tokenizes text with <see cref="StandardTokenizer"/>, + /// normalizes content with <see cref="CJKWidthFilter"/>, folds case with + /// <see cref="LowerCaseFilter"/>, forms bigrams of CJK with <see cref="CJKBigramFilter"/>, + /// and filters stopwords with <see cref="StopFilter"/> /// </summary> public sealed class CJKAnalyzer : StopwordAnalyzerBase { /// <summary> /// File containing default CJK stopwords. - /// <p/> + /// <para/> /// Currently it contains some common English words that are not usually /// useful for searching and some double-byte interpunctions. /// </summary> @@ -72,7 +72,7 @@ namespace Lucene.Net.Analysis.Cjk } /// <summary> - /// Builds an analyzer which removes words in <seealso cref="#getDefaultStopSet()"/>. + /// Builds an analyzer which removes words in <see cref="DefaultStopSet"/>. /// </summary> public CJKAnalyzer(LuceneVersion matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET) http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs index 4b8cb17..443ea04 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs @@ -23,18 +23,18 @@ namespace Lucene.Net.Analysis.Cjk */ /// <summary> - /// Forms bigrams of CJK terms that are generated from StandardTokenizer + /// Forms bigrams of CJK terms that are generated from <see cref="StandardTokenizer"/> /// or ICUTokenizer. /// <para> /// CJK types are set by these tokenizers, but you can also use - /// <seealso cref="#CJKBigramFilter(TokenStream, int)"/> to explicitly control which + /// <see cref="CJKBigramFilter(TokenStream, int)"/> to explicitly control which /// of the CJK scripts are turned into bigrams. /// </para> /// <para> /// By default, when a CJK character has no adjacent characters to form /// a bigram, it is output in unigram form. If you want to always output /// both unigrams and bigrams, set the <code>outputUnigrams</code> - /// flag in <seealso cref="CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)"/>. + /// flag in <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, int, bool)"/>. /// This can be used for a combined unigram+bigram approach. /// </para> /// <para> @@ -90,22 +90,22 @@ namespace Lucene.Net.Analysis.Cjk private readonly IPositionLengthAttribute posLengthAtt; // buffers containing codepoint and offsets in parallel - internal int[] buffer = new int[8]; - internal int[] startOffset = new int[8]; - internal int[] endOffset = new int[8]; + private int[] buffer = new int[8]; + private int[] startOffset = new int[8]; + private int[] endOffset = new int[8]; // length of valid buffer - internal int bufferLen; + private int bufferLen; // current buffer index - internal int index; + private int index; // the last end offset, to determine if we should bigram across tokens - internal int lastEndOffset; + private int lastEndOffset; private bool exhausted; /// <summary> - /// Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int) - /// CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)} + /// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, int)"> + /// CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)</see> /// </summary> public CJKBigramFilter(TokenStream @in) : this(@in, HAN | HIRAGANA | KATAKANA | HANGUL) @@ -113,8 +113,8 @@ namespace Lucene.Net.Analysis.Cjk } /// <summary> - /// Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean) - /// CJKBigramFilter(in, flags, false)} + /// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, int, bool)"> + /// CJKBigramFilter(in, flags, false)</see> /// </summary> public CJKBigramFilter(TokenStream @in, int flags) : this(@in, flags, false) @@ -122,10 +122,10 @@ namespace Lucene.Net.Analysis.Cjk } /// <summary> - /// Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, + /// Create a new <see cref="CJKBigramFilter"/>, specifying which writing systems should be bigrammed, /// and whether or not unigrams should also be output. </summary> - /// <param name="flags"> OR'ed set from <seealso cref="CJKBigramFilter#HAN"/>, <seealso cref="CJKBigramFilter#HIRAGANA"/>, - /// <seealso cref="CJKBigramFilter#KATAKANA"/>, <seealso cref="CJKBigramFilter#HANGUL"/> </param> + /// <param name="flags"> OR'ed set from <see cref="CJKBigramFilter.HAN"/>, <see cref="CJKBigramFilter.HIRAGANA"/>, + /// <see cref="CJKBigramFilter.KATAKANA"/>, <see cref="CJKBigramFilter.HANGUL"/> </param> /// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output. /// when this is false, this is only done when there are no adjacent characters to form /// a bigram. </param> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs index 8fd34fd..b9e4d97 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs @@ -21,8 +21,8 @@ namespace Lucene.Net.Analysis.Cjk */ /// <summary> - /// Factory for <seealso cref="CJKBigramFilter"/>. - /// <pre class="prettyprint"> + /// Factory for <see cref="CJKBigramFilter"/>. + /// <code> /// <fieldType name="text_cjk" class="solr.TextField"> /// <analyzer> /// <tokenizer class="solr.StandardTokenizerFactory"/> @@ -32,15 +32,16 @@ namespace Lucene.Net.Analysis.Cjk /// han="true" hiragana="true" /// katakana="true" hangul="true" outputUnigrams="false" /> /// </analyzer> - /// </fieldType></pre> + /// </fieldType> + /// </code> /// </summary> public class CJKBigramFilterFactory : TokenFilterFactory { - internal readonly int flags; - internal readonly bool outputUnigrams; + private readonly int flags; + private readonly bool outputUnigrams; /// <summary> - /// Creates a new CJKBigramFilterFactory </summary> + /// Creates a new <see cref="CJKBigramFilterFactory"/> </summary> public CJKBigramFilterFactory(IDictionary<string, string> args) : base(args) { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs index 1ff4f07..160306d 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs @@ -32,11 +32,11 @@ namespace Lucene.Net.Analysis.Cjk /// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4". /// </para> /// Additionally, the following is applied to Latin text (such as English): - /// <ul> - /// <li>Text is converted to lowercase. - /// <li>Numeric digits, '+', '#', and '_' are tokenized as letters. - /// <li>Full-width forms are converted to half-width forms. - /// </ul> + /// <list type="bullet"> + /// <item>Text is converted to lowercase.</item> + /// <item>Numeric digits, '+', '#', and '_' are tokenized as letters.</item> + /// <item>Full-width forms are converted to half-width forms.</item> + /// </list> /// For more info on Asian language (Chinese, Japanese, and Korean) text segmentation: /// please search <a /// href="http://www.google.com/search?q=word+chinese+segment">google</a> @@ -145,7 +145,7 @@ namespace Lucene.Net.Analysis.Cjk /// </summary> /// <returns> false for end of stream, true otherwise /// </returns> - /// <exception cref="java.io.IOException"> - throw IOException when read error <br> + /// <exception cref="IOException"> when read error /// happened in the InputStream /// </exception> public override bool IncrementToken() @@ -347,7 +347,7 @@ namespace Lucene.Net.Analysis.Cjk } } - public override void End() + public override sealed void End() { base.End(); // set final offset http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs index 220a7d6..c33f3a6 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs @@ -25,20 +25,21 @@ namespace Lucene.Net.Analysis.Cjk /// <summary> - /// Factory for <seealso cref="CJKTokenizer"/>. - /// <pre class="prettyprint" > + /// Factory for <see cref="CJKTokenizer"/>. + /// <code> /// <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> /// <analyzer> /// <tokenizer class="solr.CJKTokenizerFactory"/> /// </analyzer> - /// </fieldType></pre> </summary> - /// @deprecated Use <seealso cref="CJKBigramFilterFactory"/> instead. + /// </fieldType> + /// </code> + /// </summary> + /// @deprecated Use <see cref="CJKBigramFilterFactory"/> instead. [Obsolete("Use CJKBigramFilterFactory instead.")] public class CJKTokenizerFactory : TokenizerFactory { - /// <summary> - /// Creates a new CJKTokenizerFactory </summary> + /// Creates a new <see cref="CJKTokenizerFactory"/> </summary> public CJKTokenizerFactory(IDictionary<string, string> args) : base(args) { if (args.Count > 0) http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs index 331de6b..64018e2 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs @@ -22,11 +22,11 @@ namespace Lucene.Net.Analysis.Cjk */ /// <summary> - /// A <seealso cref="TokenFilter"/> that normalizes CJK width differences: - /// <ul> - /// <li>Folds fullwidth ASCII variants into the equivalent basic latin - /// <li>Folds halfwidth Katakana variants into the equivalent kana - /// </ul> + /// A <see cref="TokenFilter"/> that normalizes CJK width differences: + /// <list type="bullet"> + /// <item>Folds fullwidth ASCII variants into the equivalent basic latin</item> + /// <item>Folds halfwidth Katakana variants into the equivalent kana</item> + /// </list> /// <para> /// NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD /// Unicode normalization. See the normalization support in the ICU package @@ -37,13 +37,22 @@ namespace Lucene.Net.Analysis.Cjk { private ICharTermAttribute termAtt; - /* halfwidth kana mappings: 0xFF65-0xFF9D - * - * note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A - * as a fallback when they cannot properly combine with a preceding - * character into a composed form. - */ - private static readonly char[] KANA_NORM = new char[] { (char)0x30fb, (char)0x30f2, (char)0x30a1, (char)0x30a3, (char)0x30a5, (char)0x30a7, (char)0x30a9, (char)0x30e3, (char)0x30e5, (char)0x30e7, (char)0x30c3, (char)0x30fc, (char)0x30a2, (char)0x30a4, (char)0x30a6, (char)0x30a8, (char)0x30aa, (char)0x30ab, (char)0x30ad, (char)0x30af, (char)0x30b1, (char)0x30b3, (char)0x30b5, (char)0x30b7, (char)0x30b9, (char)0x30bb, (char)0x30bd, (char)0x30bf, (char)0x30c1, (char)0x30c4, (char)0x30c6, (char)0x30c8, (char)0x30ca, (char)0x30cb, (char)0x30cc, (char)0x30cd, (char)0x30ce, (char)0x30cf, (char)0x30d2, (char)0x30d5, (char)0x30d8, (char)0x30db, (char)0x30de, (char)0x30df, (char)0x30e0, (char)0x30e1, (char)0x30e2, (char)0x30e4, (char)0x30e6, (char)0x30e8, (char)0x30e9, (char)0x30ea, (char)0x30eb, (char)0x30ec, (char)0x30ed, (char)0x30ef, (char)0x30f3, (char)0x3099, (char)0x309A }; + /// <summary> + /// halfwidth kana mappings: 0xFF65-0xFF9D + /// <para/> + /// note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A + /// as a fallback when they cannot properly combine with a preceding + /// character into a composed form. + /// </summary> + private static readonly char[] KANA_NORM = new char[] { + (char)0x30fb, (char)0x30f2, (char)0x30a1, (char)0x30a3, (char)0x30a5, (char)0x30a7, (char)0x30a9, (char)0x30e3, (char)0x30e5, + (char)0x30e7, (char)0x30c3, (char)0x30fc, (char)0x30a2, (char)0x30a4, (char)0x30a6, (char)0x30a8, (char)0x30aa, (char)0x30ab, + (char)0x30ad, (char)0x30af, (char)0x30b1, (char)0x30b3, (char)0x30b5, (char)0x30b7, (char)0x30b9, (char)0x30bb, (char)0x30bd, + (char)0x30bf, (char)0x30c1, (char)0x30c4, (char)0x30c6, (char)0x30c8, (char)0x30ca, (char)0x30cb, (char)0x30cc, (char)0x30cd, + (char)0x30ce, (char)0x30cf, (char)0x30d2, (char)0x30d5, (char)0x30d8, (char)0x30db, (char)0x30de, (char)0x30df, (char)0x30e0, + (char)0x30e1, (char)0x30e2, (char)0x30e4, (char)0x30e6, (char)0x30e8, (char)0x30e9, (char)0x30ea, (char)0x30eb, (char)0x30ec, + (char)0x30ed, (char)0x30ef, (char)0x30f3, (char)0x3099, (char)0x309A + }; public CJKWidthFilter(TokenStream input) : base(input) @@ -87,10 +96,20 @@ namespace Lucene.Net.Analysis.Cjk } } - /* kana combining diffs: 0x30A6-0x30FD */ - private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] { 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + /// <summary>kana combining diffs: 0x30A6-0x30FD </summary> + private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] { + 78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, + 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 + }; - private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, + 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; /// <summary> /// returns true if we successfully combined the voice mark </summary> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/917b4fdf/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs index dfe8f2e..9c956e6 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs @@ -21,8 +21,8 @@ namespace Lucene.Net.Analysis.Cjk */ /// <summary> - /// Factory for <seealso cref="CJKWidthFilter"/>. - /// <pre class="prettyprint"> + /// Factory for <see cref="CJKWidthFilter"/>. + /// <code> /// <fieldType name="text_cjk" class="solr.TextField"> /// <analyzer> /// <tokenizer class="solr.StandardTokenizerFactory"/> @@ -30,13 +30,13 @@ namespace Lucene.Net.Analysis.Cjk /// <filter class="solr.LowerCaseFilterFactory"/> /// <filter class="solr.CJKBigramFilterFactory"/> /// </analyzer> - /// </fieldType></pre> + /// </fieldType> + /// </code> /// </summary> public class CJKWidthFilterFactory : TokenFilterFactory, IMultiTermAwareComponent { - /// <summary> - /// Creates a new CJKWidthFilterFactory </summary> + /// Creates a new <see cref="CJKWidthFilterFactory"/> </summary> public CJKWidthFilterFactory(IDictionary<string, string> args) : base(args) { if (args.Count > 0)
