Lucene.Net.Analysis.Common.Cjk.CJKBigramFilter refactor: Converted flags (constants) into [Flags] enum named CJKScript
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/13f16f65 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/13f16f65 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/13f16f65 Branch: refs/heads/api-work Commit: 13f16f65ed1e441992b20f69ce057a5b8d5cb6f6 Parents: 37bc447 Author: Shad Storhaug <[email protected]> Authored: Mon Mar 6 18:07:27 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Mon Mar 6 18:07:27 2017 +0700 ---------------------------------------------------------------------- .../Analysis/Cjk/CJKBigramFilter.cs | 66 +++++++++++--------- .../Analysis/Cjk/CJKBigramFilterFactory.cs | 12 ++-- .../Analysis/Cjk/TestCJKBigramFilter.cs | 8 +-- 3 files changed, 48 insertions(+), 38 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/13f16f65/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs index e5e67c0..64648e9 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs @@ -22,19 +22,41 @@ namespace Lucene.Net.Analysis.Cjk * limitations under the License. */ + // LUCENENET specific - converted constants from CJKBigramFilter + // into a flags enum. + [System.Flags] + public enum CJKScript + { + /// <summary> + /// bigram flag for Han Ideographs </summary> + HAN = 1, + /// <summary> + /// bigram flag for Hiragana </summary> + HIRAGANA = 2, + /// <summary> + /// bigram flag for Katakana </summary> + KATAKANA = 4, + /// <summary> + /// bigram flag for Hangul </summary> + HANGUL = 8, + /// <summary> + /// bigram flag for all scripts </summary> + ALL = 0xff + } + /// <summary> /// Forms bigrams of CJK terms that are generated from <see cref="StandardTokenizer"/> /// or ICUTokenizer. /// <para> /// CJK types are set by these tokenizers, but you can also use - /// <see cref="CJKBigramFilter(TokenStream, int)"/> to explicitly control which + /// <see cref="CJKBigramFilter(TokenStream, CJKScript)"/> to explicitly control which /// of the CJK scripts are turned into bigrams. /// </para> /// <para> /// By default, when a CJK character has no adjacent characters to form /// a bigram, it is output in unigram form. If you want to always output /// both unigrams and bigrams, set the <code>outputUnigrams</code> - /// flag in <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, int, bool)"/>. + /// flag in <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript, bool)"/>. /// This can be used for a combined unigram+bigram approach. /// </para> /// <para> @@ -43,21 +65,9 @@ namespace Lucene.Net.Analysis.Cjk /// </summary> public sealed class CJKBigramFilter : TokenFilter { - // LUCENENET TODO: Make the following into a [Flags] enum - // configuration - /// <summary> - /// bigram flag for Han Ideographs </summary> - public const int HAN = 1; - /// <summary> - /// bigram flag for Hiragana </summary> - public const int HIRAGANA = 2; - /// <summary> - /// bigram flag for Katakana </summary> - public const int KATAKANA = 4; - /// <summary> - /// bigram flag for Hangul </summary> - public const int HANGUL = 8; + + // LUCENENET specific - made flags into their own [Flags] enum named CJKScript and de-nested from this type /// <summary> /// when we emit a bigram, its then marked as this type </summary> @@ -107,12 +117,12 @@ namespace Lucene.Net.Analysis.Cjk /// <summary> /// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, int)"> - /// CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)</see> + /// CJKBigramFilter(@in, CJKScript.HAN | CJKScript.HIRAGANA | CJKScript.KATAKANA | CJKScript.HANGUL)</see> /// </summary> /// <param name="in"> /// Input <see cref="TokenStream"/> </param> public CJKBigramFilter(TokenStream @in) - : this(@in, HAN | HIRAGANA | KATAKANA | HANGUL) + : this(@in, CJKScript.HAN | CJKScript.HIRAGANA | CJKScript.KATAKANA | CJKScript.HANGUL) { } @@ -122,9 +132,9 @@ namespace Lucene.Net.Analysis.Cjk /// </summary> /// <param name="in"> /// Input <see cref="TokenStream"/> </param> - /// <param name="flags"> OR'ed set from <see cref="CJKBigramFilter.HAN"/>, <see cref="CJKBigramFilter.HIRAGANA"/>, - /// <see cref="CJKBigramFilter.KATAKANA"/>, <see cref="CJKBigramFilter.HANGUL"/> </param> - public CJKBigramFilter(TokenStream @in, int flags) + /// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>, + /// <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param> + public CJKBigramFilter(TokenStream @in, CJKScript flags) : this(@in, flags, false) { } @@ -134,18 +144,18 @@ namespace Lucene.Net.Analysis.Cjk /// and whether or not unigrams should also be output. </summary> /// <param name="in"> /// Input <see cref="TokenStream"/> </param> - /// <param name="flags"> OR'ed set from <see cref="CJKBigramFilter.HAN"/>, <see cref="CJKBigramFilter.HIRAGANA"/>, - /// <see cref="CJKBigramFilter.KATAKANA"/>, <see cref="CJKBigramFilter.HANGUL"/> </param> + /// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>, + /// <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param> /// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output. /// when this is false, this is only done when there are no adjacent characters to form /// a bigram. </param> - public CJKBigramFilter(TokenStream @in, int flags, bool outputUnigrams) + public CJKBigramFilter(TokenStream @in, CJKScript flags, bool outputUnigrams) : base(@in) { - doHan = (flags & HAN) == 0 ? NO : HAN_TYPE; - doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE; - doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE; - doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE; + doHan = (flags & CJKScript.HAN) == 0 ? NO : HAN_TYPE; + doHiragana = (flags & CJKScript.HIRAGANA) == 0 ? NO : HIRAGANA_TYPE; + doKatakana = (flags & CJKScript.KATAKANA) == 0 ? NO : KATAKANA_TYPE; + doHangul = (flags & CJKScript.HANGUL) == 0 ? NO : HANGUL_TYPE; this.outputUnigrams = outputUnigrams; this.termAtt = AddAttribute<ICharTermAttribute>(); this.typeAtt = AddAttribute<ITypeAttribute>(); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/13f16f65/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs index b9e4d97..d022605 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs @@ -37,7 +37,7 @@ namespace Lucene.Net.Analysis.Cjk /// </summary> public class CJKBigramFilterFactory : TokenFilterFactory { - private readonly int flags; + private readonly CJKScript flags; private readonly bool outputUnigrams; /// <summary> @@ -45,22 +45,22 @@ namespace Lucene.Net.Analysis.Cjk public CJKBigramFilterFactory(IDictionary<string, string> args) : base(args) { - int flags = 0; + CJKScript flags = 0; if (GetBoolean(args, "han", true)) { - flags |= CJKBigramFilter.HAN; + flags |= CJKScript.HAN; } if (GetBoolean(args, "hiragana", true)) { - flags |= CJKBigramFilter.HIRAGANA; + flags |= CJKScript.HIRAGANA; } if (GetBoolean(args, "katakana", true)) { - flags |= CJKBigramFilter.KATAKANA; + flags |= CJKScript.KATAKANA; } if (GetBoolean(args, "hangul", true)) { - flags |= CJKBigramFilter.HANGUL; + flags |= CJKScript.HANGUL; } this.flags = flags; this.outputUnigrams = GetBoolean(args, "outputUnigrams", false); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/13f16f65/src/Lucene.Net.Tests.Analysis.Common/Analysis/Cjk/TestCJKBigramFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Cjk/TestCJKBigramFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Cjk/TestCJKBigramFilter.cs index 93f5109..b3f5066 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Cjk/TestCJKBigramFilter.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Cjk/TestCJKBigramFilter.cs @@ -50,7 +50,7 @@ namespace Lucene.Net.Analysis.Cjk protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader); - return new TokenStreamComponents(t, new CJKBigramFilter(t, 0xff, true)); + return new TokenStreamComponents(t, new CJKBigramFilter(t, (CJKScript)0xff, true)); } } @@ -79,7 +79,7 @@ namespace Lucene.Net.Analysis.Cjk protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader); - return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN)); + return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKScript.HAN)); } } @@ -102,7 +102,7 @@ namespace Lucene.Net.Analysis.Cjk protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader); - return new TokenStreamComponents(t, new CJKBigramFilter(t, 0xff, false)); + return new TokenStreamComponents(t, new CJKBigramFilter(t, (CJKScript)0xff, false)); } } @@ -131,7 +131,7 @@ namespace Lucene.Net.Analysis.Cjk protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader); - return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true)); + return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKScript.HAN, true)); } }
