using Lucene.Net.Analysis.Util;
using System;
using System.Collections.Generic;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Factory for <see cref="JapaneseBaseFormFilter"/>.
    /// <code>
    /// <fieldType name="text_ja" class="solr.TextField">
    ///   <analyzer>
    ///     <tokenizer class="solr.JapaneseTokenizerFactory"/>
    ///     <filter class="solr.JapaneseBaseFormFilterFactory"/>
    ///   </analyzer>
    /// </fieldType>
    /// </code>
    /// </summary>
    public class JapaneseBaseFormFilterFactory : TokenFilterFactory
    {
        /// <summary>Creates a new <see cref="JapaneseBaseFormFilterFactory"/></summary>
        /// <param name="args">
        /// Factory arguments. This factory defines no parameters of its own, so any entry
        /// still present after the base constructor has run is reported as unknown.
        /// </param>
        /// <exception cref="ArgumentException">If <paramref name="args"/> still contains entries.</exception>
        public JapaneseBaseFormFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            if (args.Count > 0)
            {
                throw new ArgumentException("Unknown parameters: " + DescribeArguments(args));
            }
        }

        /// <summary>
        /// Wraps <paramref name="input"/> in a new <see cref="JapaneseBaseFormFilter"/>.
        /// </summary>
        /// <param name="input">The token stream to filter.</param>
        /// <returns>The filtering token stream.</returns>
        public override TokenStream Create(TokenStream input)
        {
            return new JapaneseBaseFormFilter(input);
        }

        /// <summary>
        /// Renders the dictionary as <c>{key=value, key=value}</c>. Concatenating the dictionary
        /// itself would only print its type name, which makes the exception message useless.
        /// </summary>
        private static string DescribeArguments(IDictionary<string, string> args)
        {
            List<string> pairs = new List<string>(args.Count);
            foreach (KeyValuePair<string, string> pair in args)
            {
                pairs.Add(pair.Key + "=" + pair.Value);
            }
            return "{" + string.Join(", ", pairs) + "}";
        }
    }
}
using Lucene.Net.Analysis.Util;
using System.Diagnostics;
using System.IO;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
    /// </summary>
    /// <remarks>
    /// Sequences of iteration marks are supported.  In case an illegal sequence of iteration
    /// marks is encountered, the implementation emits the illegal source character as-is
    /// without considering its script.  For example, with input "?ゝ", we get
    /// "??" even though "?" isn't hiragana.
    /// <para/>
    /// Note that a full stop punctuation character "。" (U+3002) can not be iterated
    /// (see below). Iteration marks themselves can be emitted in case they are illegal,
    /// i.e. if they go back past the beginning of the character stream.
    /// <para/>
    /// The implementation buffers input until a full stop punctuation character (U+3002)
    /// or EOF is reached in order to not keep a copy of the character stream in memory.
    /// Vertical iteration marks, which are even rarer than horizontal iteration marks in
    /// contemporary Japanese, are unsupported.
    /// </remarks>
    public class JapaneseIterationMarkCharFilter : CharFilter
    {
        /// <summary>Normalize kanji iteration marks by default</summary>
        public static readonly bool NORMALIZE_KANJI_DEFAULT = true;

        /// <summary>Normalize kana iteration marks by default</summary>
        public static readonly bool NORMALIZE_KANA_DEFAULT = true;

        private const char KANJI_ITERATION_MARK = '\u3005';           // 々

        private const char HIRAGANA_ITERATION_MARK = '\u309d';        // ゝ

        private const char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ

        private const char KATAKANA_ITERATION_MARK = '\u30fd';        // ヽ

        private const char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ

        private const char FULL_STOP_PUNCTUATION = '\u3002';          // 。

        // Hiragana to dakuten map (lookup using code point - 0x304b (か))
        private static readonly char[] h2d = new char[50];

        // Katakana to dakuten map (lookup using code point - 0x30ab (カ))
        private static readonly char[] k2d = new char[50];

        private readonly RollingCharBuffer buffer = new RollingCharBuffer();

        // Position of the next character to hand out from the rolling buffer
        private int bufferPosition = 0;

        // Size of the iteration mark span currently being expanded
        private int iterationMarksSpanSize = 0;

        // First position after the current span; also the lower bound a new span may reach back to
        private int iterationMarkSpanEndPosition = 0;

        private readonly bool normalizeKanji;

        private readonly bool normalizeKana;

        static JapaneseIterationMarkCharFilter()
        {
            // Hiragana dakuten map. Index i describes code point 0x304b + i; characters
            // without a voiced form map to themselves.
            h2d[0] = '\u304c';  // か => が
            h2d[1] = '\u304c';  // が => が
            h2d[2] = '\u304e';  // き => ぎ
            h2d[3] = '\u304e';  // ぎ => ぎ
            h2d[4] = '\u3050';  // く => ぐ
            h2d[5] = '\u3050';  // ぐ => ぐ
            h2d[6] = '\u3052';  // け => げ
            h2d[7] = '\u3052';  // げ => げ
            h2d[8] = '\u3054';  // こ => ご
            h2d[9] = '\u3054';  // ご => ご
            h2d[10] = '\u3056'; // さ => ざ
            h2d[11] = '\u3056'; // ざ => ざ
            h2d[12] = '\u3058'; // し => じ
            h2d[13] = '\u3058'; // じ => じ
            h2d[14] = '\u305a'; // す => ず
            h2d[15] = '\u305a'; // ず => ず
            h2d[16] = '\u305c'; // せ => ぜ
            h2d[17] = '\u305c'; // ぜ => ぜ
            h2d[18] = '\u305e'; // そ => ぞ
            h2d[19] = '\u305e'; // ぞ => ぞ
            h2d[20] = '\u3060'; // た => だ
            h2d[21] = '\u3060'; // だ => だ
            h2d[22] = '\u3062'; // ち => ぢ
            h2d[23] = '\u3062'; // ぢ => ぢ
            h2d[24] = '\u3063';
            h2d[25] = '\u3065'; // つ => づ
            h2d[26] = '\u3065'; // づ => づ
            h2d[27] = '\u3067'; // て => で
            h2d[28] = '\u3067'; // で => で
            h2d[29] = '\u3069'; // と => ど
            h2d[30] = '\u3069'; // ど => ど
            h2d[31] = '\u306a';
            h2d[32] = '\u306b';
            h2d[33] = '\u306c';
            h2d[34] = '\u306d';
            h2d[35] = '\u306e';
            h2d[36] = '\u3070'; // は => ば
            h2d[37] = '\u3070'; // ば => ば
            h2d[38] = '\u3071';
            h2d[39] = '\u3073'; // ひ => び
            h2d[40] = '\u3073'; // び => び
            h2d[41] = '\u3074';
            h2d[42] = '\u3076'; // ふ => ぶ
            h2d[43] = '\u3076'; // ぶ => ぶ
            h2d[44] = '\u3077';
            h2d[45] = '\u3079'; // へ => べ
            h2d[46] = '\u3079'; // べ => べ
            h2d[47] = '\u307a';
            h2d[48] = '\u307c'; // ほ => ぼ
            h2d[49] = '\u307c'; // ぼ => ぼ

            // Make katakana dakuten map from hiragana map
            char codePointDifference = (char)('\u30ab' - '\u304b'); // カ - か
            Debug.Assert(h2d.Length == k2d.Length);
            for (int i = 0; i < k2d.Length; i++)
            {
                k2d[i] = (char)(h2d[i] + codePointDifference);
            }
        }

        /// <summary>
        /// Constructor. Normalizes both kanji and kana iteration marks by default.
        /// </summary>
        /// <param name="input">Char stream.</param>
        public JapaneseIterationMarkCharFilter(TextReader input)
            : this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT)
        {
        }

        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="input">Char stream.</param>
        /// <param name="normalizeKanji">Indicates whether kanji iteration marks should be normalized.</param>
        /// <param name="normalizeKana">Indicates whether kana iteration marks should be normalized.</param>
        public JapaneseIterationMarkCharFilter(TextReader input, bool normalizeKanji, bool normalizeKana)
            : base(input)
        {
            this.normalizeKanji = normalizeKanji;
            this.normalizeKana = normalizeKana;
            buffer.Reset(input);
        }

        /// <summary>
        /// Reads up to <paramref name="length"/> normalized characters into <paramref name="buffer"/>,
        /// beginning at <paramref name="offset"/>, by repeatedly calling <see cref="Read()"/>.
        /// </summary>
        /// <param name="buffer">
        /// Destination array. On return, the elements from <paramref name="offset"/> onwards hold the
        /// characters that were read.
        /// </param>
        /// <param name="offset">
        /// The position in <paramref name="buffer"/> at which to begin writing.
        /// </param>
        /// <param name="length">
        /// The maximum number of characters to read.
        /// </param>
        /// <returns>
        /// The number of characters read, or -1 when no character was read at all (end of input, or
        /// <paramref name="length"/> is zero). Note that this differs from the usual
        /// <see cref="TextReader"/> convention of returning 0 at end of input.
        /// </returns>
        public override int Read(char[] buffer, int offset, int length)
        {
            int read = 0;

            for (int i = offset; i < offset + length; i++)
            {
                int c = Read();
                if (c == -1)
                {
                    break;
                }
                buffer[i] = (char)c;
                read++;
            }

            return read == 0 ? -1 : read;
        }

        /// <summary>
        /// Reads the next character from the text reader and advances the character position by one character.
        /// Iteration marks are replaced by the character they repeat.
        /// </summary>
        /// <returns>The next character from the text reader, or -1 if no more characters are available.</returns>
        public override int Read()
        {
            int ic = buffer.Get(bufferPosition);

            // End of input
            if (ic == -1)
            {
                buffer.FreeBefore(bufferPosition);
                return ic;
            }

            char c = (char)ic;

            // Skip surrogate pair characters: moving the span end past them keeps a later
            // iteration mark from reaching back to half of a surrogate pair
            if (char.IsHighSurrogate(c) || char.IsLowSurrogate(c))
            {
                iterationMarkSpanEndPosition = bufferPosition + 1;
            }

            // Free rolling buffer on full stop
            if (c == FULL_STOP_PUNCTUATION)
            {
                buffer.FreeBefore(bufferPosition);
                iterationMarkSpanEndPosition = bufferPosition + 1;
            }

            // Normalize iteration mark
            if (IsIterationMark(c))
            {
                c = NormalizeIterationMark(c);
            }

            bufferPosition++;
            return c;
        }

        /// <summary>
        /// Normalizes the iteration mark character <paramref name="c"/>
        /// </summary>
        /// <param name="c">Iteration mark character to normalize.</param>
        /// <returns>Normalized iteration mark.</returns>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        private char NormalizeIterationMark(char c)
        {

            // Case 1: Inside an iteration mark span
            if (bufferPosition < iterationMarkSpanEndPosition)
            {
                return Normalize(SourceCharacter(bufferPosition, iterationMarksSpanSize), c);
            }

            // Case 2: New iteration mark spans starts where the previous one ended, which is illegal
            if (bufferPosition == iterationMarkSpanEndPosition)
            {
                // Emit the illegal iteration mark and increase end position to indicate that we can't
                // start a new span on the next position either
                iterationMarkSpanEndPosition++;
                return c;
            }

            // Case 3: New iteration mark span
            iterationMarksSpanSize = NextIterationMarkSpanSize();
            iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
            return Normalize(SourceCharacter(bufferPosition, iterationMarksSpanSize), c);
        }

        /// <summary>
        /// Finds the number of subsequent next iteration marks
        /// </summary>
        /// <returns>Number of iteration marks starting at the current buffer position.</returns>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        private int NextIterationMarkSpanSize()
        {
            int spanSize = 0;
            for (int i = bufferPosition; buffer.Get(i) != -1 && IsIterationMark((char)(buffer.Get(i))); i++)
            {
                spanSize++;
            }
            // Restrict span size so that we don't go past the previous end position
            if (bufferPosition - spanSize < iterationMarkSpanEndPosition)
            {
                spanSize = bufferPosition - iterationMarkSpanEndPosition;
            }
            return spanSize;
        }

        /// <summary>
        /// Returns the source character for a given position and iteration mark span size.
        /// </summary>
        /// <param name="position">Buffer position (should not exceed bufferPosition).</param>
        /// <param name="spanSize">Iteration mark span size.</param>
        /// <returns>Source character.</returns>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        private char SourceCharacter(int position, int spanSize)
        {
            return (char)buffer.Get(position - spanSize);
        }

        /// <summary>
        /// Normalize a character.
        /// </summary>
        /// <param name="c">Character to normalize.</param>
        /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
        /// <returns>Normalized character - return c on illegal iteration marks.</returns>
        private char Normalize(char c, char m)
        {
            if (IsHiraganaIterationMark(m))
            {
                return NormalizedHiragana(c, m);
            }

            if (IsKatakanaIterationMark(m))
            {
                return NormalizedKatakana(c, m);
            }

            return c; // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
        }

        /// <summary>
        /// Normalize hiragana character.
        /// </summary>
        /// <param name="c">Hiragana character.</param>
        /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
        /// <returns>Normalized character - return <paramref name="c"/> on illegal iteration marks.</returns>
        private char NormalizedHiragana(char c, char m)
        {
            switch (m)
            {
                case HIRAGANA_ITERATION_MARK:
                    // The unvoiced mark repeats the unvoiced form, which sits one code point below the voiced one
                    return IsHiraganaDakuten(c) ? (char)(c - 1) : c;
                case HIRAGANA_VOICED_ITERATION_MARK:
                    return LookupHiraganaDakuten(c);
                default:
                    return c;
            }
        }

        /// <summary>
        /// Normalize katakana character.
        /// </summary>
        /// <param name="c">Katakana character.</param>
        /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
        /// <returns>Normalized character - return <paramref name="c"/> on illegal iteration marks.</returns>
        private char NormalizedKatakana(char c, char m)
        {
            switch (m)
            {
                case KATAKANA_ITERATION_MARK:
                    // The unvoiced mark repeats the unvoiced form, which sits one code point below the voiced one
                    return IsKatakanaDakuten(c) ? (char)(c - 1) : c;
                case KATAKANA_VOICED_ITERATION_MARK:
                    return LookupKatakanaDakuten(c);
                default:
                    return c;
            }
        }

        /// <summary>
        /// Iteration mark character predicate.
        /// </summary>
        /// <param name="c">Character to test.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is an iteration mark character.  Otherwise <c>false</c>.</returns>
        private bool IsIterationMark(char c)
        {
            return IsKanjiIterationMark(c) || IsHiraganaIterationMark(c) || IsKatakanaIterationMark(c);
        }

        /// <summary>
        /// Hiragana iteration mark character predicate. Always <c>false</c> when kana normalization is disabled.
        /// </summary>
        /// <param name="c">Character to test.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is a hiragana iteration mark character.  Otherwise <c>false</c>.</returns>
        private bool IsHiraganaIterationMark(char c)
        {
            if (normalizeKana)
            {
                return c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK;
            }
            else
            {
                return false;
            }
        }

        /// <summary>
        /// Katakana iteration mark character predicate. Always <c>false</c> when kana normalization is disabled.
        /// </summary>
        /// <param name="c">Character to test.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is a katakana iteration mark character.  Otherwise <c>false</c>.</returns>
        private bool IsKatakanaIterationMark(char c)
        {
            if (normalizeKana)
            {
                return c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK;
            }
            else
            {
                return false;
            }
        }

        /// <summary>
        /// Kanji iteration mark character predicate. Always <c>false</c> when kanji normalization is disabled.
        /// </summary>
        /// <param name="c">Character to test.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is a kanji iteration mark character.  Otherwise <c>false</c>.</returns>
        private bool IsKanjiIterationMark(char c)
        {
            if (normalizeKanji)
            {
                return c == KANJI_ITERATION_MARK;
            }
            else
            {
                return false;
            }
        }

        /// <summary>
        /// Look up hiragana dakuten.
        /// </summary>
        /// <param name="c">Character to look up.</param>
        /// <returns>Hiragana dakuten variant of <paramref name="c"/> or <paramref name="c"/> itself if no dakuten variant exists.</returns>
        private char LookupHiraganaDakuten(char c)
        {
            return Lookup(c, h2d, '\u304b'); // Code point is for か
        }

        /// <summary>
        /// Look up katakana dakuten. Only full-width katakana are supported.
        /// </summary>
        /// <param name="c">Character to look up.</param>
        /// <returns>Katakana dakuten variant of <paramref name="c"/> or <paramref name="c"/> itself if no dakuten variant exists.</returns>
        private char LookupKatakanaDakuten(char c)
        {
            return Lookup(c, k2d, '\u30ab'); // Code point is for カ
        }

        /// <summary>
        /// Hiragana dakuten predicate.
        /// </summary>
        /// <param name="c">Character to check.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is a hiragana dakuten and otherwise <c>false</c>.</returns>
        private bool IsHiraganaDakuten(char c)
        {
            return IsDakuten(c, h2d, '\u304b');
        }

        /// <summary>
        /// Katakana dakuten predicate.
        /// </summary>
        /// <param name="c">Character to check.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is a katakana dakuten and otherwise <c>false</c>.</returns>
        private bool IsKatakanaDakuten(char c)
        {
            return IsDakuten(c, k2d, '\u30ab');
        }

        /// <summary>
        /// Predicate indicating whether <paramref name="c"/> is the voiced (dakuten) form of the
        /// character directly preceding it in the map.
        /// </summary>
        /// <remarks>
        /// Checking only that <paramref name="c"/> maps to itself is not enough: characters that have
        /// no voiced form at all also map to themselves, and would then be "unvoiced" into the
        /// unrelated preceding code point. Requiring that the predecessor maps to
        /// <paramref name="c"/> singles out the real voiced characters.
        /// </remarks>
        /// <param name="c">Character to check.</param>
        /// <param name="map">Dakuten map.</param>
        /// <param name="offset">Code point of the first character described by <paramref name="map"/>.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is a voiced character and otherwise <c>false</c>.</returns>
        private bool IsDakuten(char c, char[] map, char offset)
        {
            // c > offset guarantees that c - 1 is still inside the map
            return c > offset && Inside(c, map, offset) && Lookup((char)(c - 1), map, offset) == c;
        }

        /// <summary>
        /// Looks up a character in dakuten map and returns the dakuten variant if it exists.
        /// Otherwise return the character being looked up itself.
        /// </summary>
        /// <param name="c">Character to look up.</param>
        /// <param name="map">Dakuten map.</param>
        /// <param name="offset">Code point offset from <paramref name="c"/>.</param>
        /// <returns>Mapped character or <paramref name="c"/> if no mapping exists.</returns>
        private char Lookup(char c, char[] map, char offset)
        {
            if (!Inside(c, map, offset))
            {
                return c;
            }
            else
            {
                return map[c - offset];
            }
        }

        /// <summary>
        /// Predicate indicating if the lookup character is within dakuten map range.
        /// </summary>
        /// <param name="c">Character to look up.</param>
        /// <param name="map">Dakuten map.</param>
        /// <param name="offset">Code point offset from <paramref name="c"/>.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is mapped by map and otherwise <c>false</c>.</returns>
        private bool Inside(char c, char[] map, char offset)
        {
            return c >= offset && c < offset + map.Length;
        }

        /// <summary>
        /// Returns <paramref name="currentOff"/> unchanged.
        /// </summary>
        /// <param name="currentOff">Offset in the filtered stream.</param>
        /// <returns>The same offset.</returns>
        protected override int Correct(int currentOff)
        {
            return currentOff; // this filter doesn't change the length of strings
        }
    }
}
using Lucene.Net.Analysis.Util;
using System;
using System.Collections.Generic;
using System.IO;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Factory for <see cref="JapaneseIterationMarkCharFilter"/>.
    /// <code>
    /// <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
    ///   <analyzer>
    ///     <charFilter class="solr.JapaneseIterationMarkCharFilterFactory" normalizeKanji="true" normalizeKana="true"/>
    ///     <tokenizer class="solr.JapaneseTokenizerFactory"/>
    ///   </analyzer>
    /// </fieldType>
    /// </code>
    /// </summary>
    public class JapaneseIterationMarkCharFilterFactory : CharFilterFactory, IMultiTermAwareComponent
    {
        private const string NORMALIZE_KANJI_PARAM = "normalizeKanji";
        private const string NORMALIZE_KANA_PARAM = "normalizeKana";

        private readonly bool normalizeKanji;
        private readonly bool normalizeKana;

        /// <summary>Creates a new <see cref="JapaneseIterationMarkCharFilterFactory"/></summary>
        /// <param name="args">
        /// Factory arguments. <c>normalizeKanji</c> and <c>normalizeKana</c> are read here, falling
        /// back to the defaults declared on <see cref="JapaneseIterationMarkCharFilter"/>. Any entry
        /// still present afterwards is reported as unknown.
        /// </param>
        /// <exception cref="ArgumentException">If <paramref name="args"/> still contains entries after the known parameters were read.</exception>
        public JapaneseIterationMarkCharFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            normalizeKanji = GetBoolean(args, NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
            normalizeKana = GetBoolean(args, NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
            if (args.Count > 0)
            {
                throw new ArgumentException("Unknown parameters: " + DescribeArguments(args));
            }
        }

        /// <summary>
        /// Wraps <paramref name="input"/> in a new <see cref="JapaneseIterationMarkCharFilter"/>
        /// configured with this factory's normalization flags.
        /// </summary>
        /// <param name="input">The reader to filter.</param>
        /// <returns>The filtering reader.</returns>
        public override TextReader Create(TextReader input)
        {
            return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana);
        }

        /// <summary>
        /// Returns this factory as the component to use for multi-term queries.
        /// </summary>
        public virtual AbstractAnalysisFactory GetMultiTermComponent()
        {
            return this;
        }

        /// <summary>
        /// Renders the dictionary as <c>{key=value, key=value}</c>. Concatenating the dictionary
        /// itself would only print its type name, which makes the exception message useless.
        /// </summary>
        private static string DescribeArguments(IDictionary<string, string> args)
        {
            List<string> pairs = new List<string>(args.Count);
            foreach (KeyValuePair<string, string> pair in args)
            {
                pairs.Add(pair.Key + "=" + pair.Value);
            }
            return "{" + string.Join(", ", pairs) + "}";
        }
    }
}
using Lucene.Net.Analysis.TokenAttributes;
using System.Text.RegularExpressions;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A <see cref="TokenFilter"/> that normalizes common katakana spelling variations
    /// ending in a long sound character by removing this character (U+30FC).  Only
    /// katakana words longer than a minimum length are stemmed (default is four).
    /// </summary>
    /// <remarks>
    /// Note that only full-width katakana characters are supported.  Please use a
    /// <see cref="Cjk.CJKWidthFilter"/> to convert half-width
    /// katakana to full-width before using this filter.
    /// <para/>
    /// In order to prevent terms from being stemmed, use an instance of
    /// <see cref="Miscellaneous.SetKeywordMarkerFilter"/>
    /// or a custom <see cref="TokenFilter"/> that sets the <see cref="IKeywordAttribute"/>
    /// before this <see cref="TokenStream"/>.
    /// </remarks>
    public sealed class JapaneseKatakanaStemFilter : TokenFilter
    {
        public static readonly int DEFAULT_MINIMUM_LENGTH = 4;
        private const char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';

        // Bounds of the Unicode "Katakana" block, i.e. the set matched by the regex class \p{IsKatakana}
        private const char KATAKANA_BLOCK_FIRST = '\u30a0';
        private const char KATAKANA_BLOCK_LAST = '\u30ff';

        private readonly ICharTermAttribute termAttr;
        private readonly IKeywordAttribute keywordAttr;
        private readonly int minimumKatakanaLength;

        /// <summary>
        /// Creates a new <see cref="JapaneseKatakanaStemFilter"/>.
        /// </summary>
        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
        /// <param name="minimumLength">Terms shorter than this many characters are left untouched.</param>
        public JapaneseKatakanaStemFilter(TokenStream input, int minimumLength)
            : base(input)
        {
            this.minimumKatakanaLength = minimumLength;
            this.termAttr = AddAttribute<ICharTermAttribute>();
            this.keywordAttr = AddAttribute<IKeywordAttribute>();
        }

        /// <summary>
        /// Creates a new <see cref="JapaneseKatakanaStemFilter"/> using <see cref="DEFAULT_MINIMUM_LENGTH"/>.
        /// </summary>
        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
        public JapaneseKatakanaStemFilter(TokenStream input)
            : this(input, DEFAULT_MINIMUM_LENGTH)
        {
        }

        /// <summary>
        /// Advances to the next token and, unless it is flagged as a keyword, shortens the term
        /// to its stemmed length.
        /// </summary>
        /// <returns><c>true</c> if a token is available; <c>false</c> at the end of the stream.</returns>
        public override bool IncrementToken()
        {
            if (m_input.IncrementToken())
            {
                if (!keywordAttr.IsKeyword)
                {
                    termAttr.SetLength(Stem(termAttr.Buffer, termAttr.Length));
                }
                return true;
            }
            else
            {
                return false;
            }
        }

        /// <summary>
        /// Computes the stemmed length of a term.
        /// </summary>
        /// <param name="term">Term buffer.</param>
        /// <param name="length">Number of valid characters in <paramref name="term"/>.</param>
        /// <returns>
        /// <c>length - 1</c> if the term is at least the minimum length, consists only of katakana
        /// and ends in the prolonged sound mark; otherwise <paramref name="length"/>.
        /// </returns>
        private int Stem(char[] term, int length)
        {
            // The length <= 0 guard protects term[length - 1] below when the filter was
            // constructed with a non-positive minimum length
            if (length <= 0 || length < minimumKatakanaLength)
            {
                return length;
            }

            if (!IsKatakana(term, length))
            {
                return length;
            }

            if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK)
            {
                return length - 1;
            }

            return length;
        }

        /// <summary>
        /// Checks whether the first <paramref name="length"/> characters of <paramref name="term"/>
        /// all lie in the Unicode Katakana block (U+30A0 to U+30FF).
        /// </summary>
        private static bool IsKatakana(char[] term, int length)
        {
            for (int i = 0; i < length; i++)
            {
                // NOTE: Test only identifies full-width characters -- half-widths are not supported.
                // A plain range check avoids allocating a string and running a regex per character.
                char c = term[i];
                if (c < KATAKANA_BLOCK_FIRST || c > KATAKANA_BLOCK_LAST)
                {
                    return false;
                }
            }
            return true;
        }
    }
}
using Lucene.Net.Analysis.Util;
using System;
using System.Collections.Generic;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Factory for <see cref="JapaneseKatakanaStemFilter"/>.
    /// <code>
    /// <fieldType name="text_ja" class="solr.TextField">
    ///   <analyzer>
    ///     <tokenizer class="solr.JapaneseTokenizerFactory"/>
    ///     <filter class="solr.JapaneseKatakanaStemFilterFactory"
    ///             minimumLength="4"/>
    ///   </analyzer>
    /// </fieldType>
    /// </code>
    /// </summary>
    public class JapaneseKatakanaStemFilterFactory : TokenFilterFactory
    {
        private const string MINIMUM_LENGTH_PARAM = "minimumLength";
        private readonly int minimumLength;

        /// <summary>Creates a new <see cref="JapaneseKatakanaStemFilterFactory"/></summary>
        /// <param name="args">
        /// Factory arguments. <c>minimumLength</c> is read here and defaults to
        /// <see cref="JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH"/>. Any entry still
        /// present afterwards is reported as unknown.
        /// </param>
        /// <exception cref="ArgumentException">
        /// If <c>minimumLength</c> is less than 2, or if <paramref name="args"/> still contains
        /// entries after the known parameters were read.
        /// </exception>
        public JapaneseKatakanaStemFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            minimumLength = GetInt32(args, MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
            if (minimumLength < 2)
            {
                throw new ArgumentException("Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
            }
            if (args.Count > 0)
            {
                throw new ArgumentException("Unknown parameters: " + DescribeArguments(args));
            }
        }

        /// <summary>
        /// Wraps <paramref name="input"/> in a new <see cref="JapaneseKatakanaStemFilter"/>
        /// configured with this factory's minimum length.
        /// </summary>
        /// <param name="input">The token stream to filter.</param>
        /// <returns>The filtering token stream.</returns>
        public override TokenStream Create(TokenStream input)
        {
            return new JapaneseKatakanaStemFilter(input, minimumLength);
        }

        /// <summary>
        /// Renders the dictionary as <c>{key=value, key=value}</c>. Concatenating the dictionary
        /// itself would only print its type name, which makes the exception message useless.
        /// </summary>
        private static string DescribeArguments(IDictionary<string, string> args)
        {
            List<string> pairs = new List<string>(args.Count);
            foreach (KeyValuePair<string, string> pair in args)
            {
                pairs.Add(pair.Key + "=" + pair.Value);
            }
            return "{" + string.Join(", ", pairs) + "}";
        }
    }
}
using Lucene.Net.Analysis.Ja.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Token filter that drops every token whose part-of-speech tag is contained
    /// in a configured set of stop tags.
    /// </summary>
    public sealed class JapanesePartOfSpeechStopFilter : FilteringTokenFilter
    {
        private readonly ISet<string> stopTags;
        private readonly IPartOfSpeechAttribute posAtt;

        /// <summary>
        /// Legacy constructor that additionally takes the position-increment switch.
        /// </summary>
        /// <param name="version">The Lucene match version.</param>
        /// <param name="enablePositionIncrements">Passed through to the base filter.</param>
        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
        /// <param name="stopTags">The part-of-speech tags that should be removed.</param>
        [Obsolete("EnablePositionIncrements=false is not supported anymore as of Lucene 4.4.")]
        public JapanesePartOfSpeechStopFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream input, ISet<string> stopTags)
            : base(version, enablePositionIncrements, input)
        {
            posAtt = AddAttribute<IPartOfSpeechAttribute>();
            this.stopTags = stopTags;
        }

        /// <summary>
        /// Create a new <see cref="JapanesePartOfSpeechStopFilter"/>.
        /// </summary>
        /// <param name="version">The Lucene match version.</param>
        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
        /// <param name="stopTags">The part-of-speech tags that should be removed.</param>
        public JapanesePartOfSpeechStopFilter(LuceneVersion version, TokenStream input, ISet<string> stopTags)
            : base(version, input)
        {
            posAtt = AddAttribute<IPartOfSpeechAttribute>();
            this.stopTags = stopTags;
        }

        /// <summary>
        /// Decides whether the current token is kept.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the token has no part-of-speech tag or its tag is not a stop tag;
        /// <c>false</c> when the tag is one of the stop tags.
        /// </returns>
        protected override bool Accept()
        {
            string partOfSpeech = posAtt.GetPartOfSpeech();

            // Untagged tokens are always kept
            if (partOfSpeech == null)
            {
                return true;
            }

            return !stopTags.Contains(partOfSpeech);
        }
    }
}
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapanesePartOfSpeechStopFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapanesePartOfSpeechStopFilterFactory"
+    ///             tags="stopTags.txt"
+    ///             enablePositionIncrements="true"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapanesePartOfSpeechStopFilterFactory : TokenFilterFactory, IResourceLoaderAware
+    {
+        private readonly string stopTagFiles;
+        private readonly bool enablePositionIncrements;
+
+        // Populated by Inform(); stays null until then, or when no tags were loaded.
+        private ISet<string> stopTags;
+
+        /// <summary>Creates a new <see cref="JapanesePartOfSpeechStopFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. <c>tags</c> and <c>enablePositionIncrements</c> are read here;
+        /// the latter is requested with <c>true</c> as its fallback value.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="args"/> still contains entries after the known parameters were read.
+        /// </exception>
+        public JapanesePartOfSpeechStopFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            // NOTE(review): Get/GetBoolean presumably remove the keys they read, which is
+            // what makes the leftover check below meaningful - confirm against the base class.
+            stopTagFiles = Get(args, "tags");
+            enablePositionIncrements = GetBoolean(args, "enablePositionIncrements", true);
+            if (args.Count > 0)
+            {
+                // The standard .NET dictionary types do not print their contents from
+                // ToString(), so concatenating the dictionary would only show a type name.
+                // List the leftover entries explicitly instead.
+                List<string> unknown = new List<string>(args.Count);
+                foreach (KeyValuePair<string, string> pair in args)
+                {
+                    unknown.Add(pair.Key + "=" + pair.Value);
+                }
+                throw new ArgumentException("Unknown parameters: {" + string.Join(", ", unknown) + "}");
+            }
+        }
+
+        /// <summary>
+        /// Loads the stop tags named by the <c>tags</c> argument through <paramref name="loader"/>.
+        /// Any previously loaded set is discarded first.
+        /// </summary>
+        /// <param name="loader">The resource loader used to read the stop tag file(s).</param>
+        public virtual void Inform(IResourceLoader loader)
+        {
+            stopTags = null;
+            CharArraySet cas = GetWordSet(loader, stopTagFiles, false);
+            if (cas != null)
+            {
+                // Copy into a plain string set, which is what the filter constructor accepts.
+                stopTags = new HashSet<string>();
+                foreach (string element in cas)
+                {
+                    stopTags.Add(element);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="stream"/> in a <see cref="JapanesePartOfSpeechStopFilter"/>
+        /// when stop tags are available.
+        /// </summary>
+        /// <param name="stream">The <see cref="TokenStream"/> to filter.</param>
+        /// <returns>
+        /// The filtering stream, or <paramref name="stream"/> itself when no stop tags are loaded.
+        /// </returns>
+        public override TokenStream Create(TokenStream stream)
+        {
+            // if stopTags is null, it means the file is empty (or Inform() has not run yet)
+            if (stopTags != null)
+            {
+                // The obsolete overload is the only one that takes enablePositionIncrements.
+#pragma warning disable 612, 618
+                TokenStream filter = new JapanesePartOfSpeechStopFilter(m_luceneMatchVersion, enablePositionIncrements, stream, stopTags);
+#pragma warning restore 612, 618
+                return filter;
+            }
+            else
+            {
+                return stream;
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs new file mode 100644 index 0000000..b2e1542 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs @@ -0,0 +1,89 @@ +using Lucene.Net.Analysis.Ja.TokenAttributes; +using Lucene.Net.Analysis.Ja.Util; +using Lucene.Net.Analysis.TokenAttributes; +using System.Text; + +namespace Lucene.Net.Analysis.Ja +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A <see cref="TokenFilter"/> that replaces the term + /// attribute with the reading of a token in either katakana or romaji form. + /// The default reading form is katakana. 
+    /// </summary>
+    /// <remarks>
+    /// In katakana mode a token without a reading is passed through unchanged.
+    /// In romaji mode a token without a reading has its own term text romanized instead.
+    /// </remarks>
+    public sealed class JapaneseReadingFormFilter : TokenFilter
+    {
+        private readonly ICharTermAttribute termAttr;
+        private readonly IReadingAttribute readingAttr;
+
+        // Scratch space for romanization, cleared before each use.
+        private readonly StringBuilder buffer = new StringBuilder();
+        private readonly bool useRomaji;
+
+        /// <summary>
+        /// Creates a filter that emits readings in the requested form.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        /// <param name="useRomaji"><c>true</c> to emit romaji; <c>false</c> to emit the reading as-is.</param>
+        public JapaneseReadingFormFilter(TokenStream input, bool useRomaji)
+            : base(input)
+        {
+            termAttr = AddAttribute<ICharTermAttribute>();
+            readingAttr = AddAttribute<IReadingAttribute>();
+            this.useRomaji = useRomaji;
+        }
+
+        /// <summary>
+        /// Creates a filter with romaji output turned off.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        public JapaneseReadingFormFilter(TokenStream input)
+            : this(input, false)
+        {
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and rewrites the term text of the new token.
+        /// </summary>
+        /// <returns><c>false</c> once the wrapped stream is exhausted; otherwise <c>true</c>.</returns>
+        public override bool IncrementToken()
+        {
+            if (!m_input.IncrementToken())
+            {
+                return false;
+            }
+
+            string reading = readingAttr.GetReading();
+
+            if (useRomaji)
+            {
+                // if it's an OOV term (no reading), just try the term text
+                string source = reading ?? termAttr.ToString();
+                buffer.Length = 0;
+                ToStringUtil.GetRomanization(buffer, source);
+                termAttr.SetEmpty().Append(buffer);
+            }
+            else if (reading != null)
+            {
+                // just replace the term text with the reading, if it exists
+                termAttr.SetEmpty().Append(reading);
+            }
+
+            return true;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
new file mode 100644
index 0000000..9464c2e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
@@ -0,0 +1,57 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license
agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapaneseReadingFormFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapaneseReadingFormFilterFactory"
+    ///             useRomaji="false"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapaneseReadingFormFilterFactory : TokenFilterFactory
+    {
+        // Name of the optional argument that switches the created filter to romaji output.
+        private const string ROMAJI_PARAM = "useRomaji";
+        private readonly bool useRomaji;
+
+        /// <summary>Creates a new <see cref="JapaneseReadingFormFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. Only <c>useRomaji</c> is read here, with <c>false</c>
+        /// requested as its fallback value.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="args"/> still contains entries after <c>useRomaji</c> was read.
+        /// </exception>
+        public JapaneseReadingFormFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            // NOTE(review): GetBoolean presumably removes the key it reads, which is what
+            // makes the leftover check below meaningful - confirm against the base class.
+            useRomaji = GetBoolean(args, ROMAJI_PARAM, false);
+            if (args.Count > 0)
+            {
+                // The standard .NET dictionary types do not print their contents from
+                // ToString(), so concatenating the dictionary would only show a type name.
+                // List the leftover entries explicitly instead.
+                List<string> unknown = new List<string>(args.Count);
+                foreach (KeyValuePair<string, string> pair in args)
+                {
+                    unknown.Add(pair.Key + "=" + pair.Value);
+                }
+                throw new ArgumentException("Unknown parameters: {" + string.Join(", ", unknown) + "}");
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="JapaneseReadingFormFilter"/>
+        /// configured with this factory's <c>useRomaji</c> setting.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to filter.</param>
+        /// <returns>The new filtering stream.</returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new JapaneseReadingFormFilter(input, useRomaji);
+        }
+    }
+}
