[10/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

nightowl888 Sun, 23 Jul 2017 10:37:18 -0700

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs 
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs
new file mode 100644
index 0000000..5524be7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs
@@ -0,0 +1,52 @@
+ï»¿using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapaneseBaseFormFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapaneseBaseFormFilterFactory"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapaneseBaseFormFilterFactory : TokenFilterFactory
+    {
+        /// <summary>
+        /// Creates a new <see cref="JapaneseBaseFormFilterFactory"/>.
+        /// </summary>
+        /// <param name="args">
+        /// Factory arguments. They are handed to the base constructor first; any entry
+        /// still present afterwards is treated as unknown.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// If <paramref name="args"/> is not empty after the base constructor has run.
+        /// </exception>
+        public JapaneseBaseFormFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name,
+                // so name the offending keys explicitly.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a new <see cref="JapaneseBaseFormFilter"/>.
+        /// </summary>
+        /// <param name="input">Token stream to wrap.</param>
+        /// <returns>The wrapping <see cref="JapaneseBaseFormFilter"/>.</returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new JapaneseBaseFormFilter(input);
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs
new file mode 100644
index 0000000..71566bb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs
@@ -0,0 +1,500 @@
+ï»¿using Lucene.Net.Analysis.Util;
+using System.Diagnostics;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
+    /// </summary>
+    /// <remarks>
+    /// Sequences of iteration marks are supported.  In case an illegal sequence of iteration
+    /// marks is encountered, the implementation emits the illegal source character as-is
+    /// without considering its script.  For example, with input "&#x003f;&#x309d;", we get
+    /// "&#x003f;&#x003f;" even though "&#x003f;" isn't hiragana.
+    /// <para/>
+    /// Note that a full stop punctuation character "&#x3002;" (U+3002) cannot be iterated
+    /// (see below). Iteration marks themselves can be emitted in case they are illegal,
+    /// i.e. if they go back past the beginning of the character stream.
+    /// <para/>
+    /// The implementation buffers input until a full stop punctuation character (U+3002)
+    /// or EOF is reached in order to not keep a copy of the character stream in memory.
+    /// Vertical iteration marks, which are even rarer than horizontal iteration marks in
+    /// contemporary Japanese, are unsupported.
+    /// </remarks>
+    public class JapaneseIterationMarkCharFilter : CharFilter
+    {
+        /// <summary>Normalize kanji iteration marks by default</summary>
+        public static readonly bool NORMALIZE_KANJI_DEFAULT = true;
+
+        /// <summary>Normalize kana iteration marks by default</summary>
+        public static readonly bool NORMALIZE_KANA_DEFAULT = true;
+
+        private const char KANJI_ITERATION_MARK = '\u3005';           // 々
+
+        private const char HIRAGANA_ITERATION_MARK = '\u309d';        // ゝ
+
+        private const char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ
+
+        private const char KATAKANA_ITERATION_MARK = '\u30fd';        // ヽ
+
+        private const char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ
+
+        private const char FULL_STOP_PUNCTUATION = '\u3002';           // 。
+
+        // Hiragana to dakuten map (lookup using code point - 0x304b, i.e. relative to か).
+        // The array reference never changes; only its elements are filled in by the static constructor.
+        private static readonly char[] h2d = new char[50];
+
+        // Katakana to dakuten map (lookup using code point - 0x30ab, i.e. relative to カ).
+        // The array reference never changes; only its elements are filled in by the static constructor.
+        private static readonly char[] k2d = new char[50];
+
+        // Rolling view over the input; reset with the input reader in the constructor
+        private readonly RollingCharBuffer buffer = new RollingCharBuffer();
+
+        // Position in the rolling buffer of the next character Read() will return
+        private int bufferPosition = 0;
+
+        // Size of the iteration mark span most recently measured by NextIterationMarkSpanSize()
+        private int iterationMarksSpanSize = 0;
+
+        // Position (exclusive) before which a new iteration mark span may not reach back
+        private int iterationMarkSpanEndPosition = 0;
+
+        // Whether the kanji iteration mark is recognised; assigned once in the constructor
+        private readonly bool normalizeKanji;
+
+        // Whether the kana iteration marks are recognised; assigned once in the constructor
+        private readonly bool normalizeKana;
+
+        /// <summary>
+        /// Fills the hiragana dakuten table and derives the katakana table from it.
+        /// Index <c>i</c> of the hiragana table corresponds to the character U+304B + i.
+        /// </summary>
+        static JapaneseIterationMarkCharFilter()
+        {
+            // Hiragana dakuten map: unvoiced and voiced forms both map to the voiced form;
+            // characters without a voiced form map to themselves.
+            h2d[0] = '\u304c';  // か => が
+            h2d[1] = '\u304c';  // が => が
+            h2d[2] = '\u304e';  // き => ぎ
+            h2d[3] = '\u304e';  // ぎ => ぎ
+            h2d[4] = '\u3050';  // く => ぐ
+            h2d[5] = '\u3050';  // ぐ => ぐ
+            h2d[6] = '\u3052';  // け => げ
+            h2d[7] = '\u3052';  // げ => げ
+            h2d[8] = '\u3054';  // こ => ご
+            h2d[9] = '\u3054';  // ご => ご
+            h2d[10] = '\u3056'; // さ => ざ
+            h2d[11] = '\u3056'; // ざ => ざ
+            h2d[12] = '\u3058'; // し => じ
+            h2d[13] = '\u3058'; // じ => じ
+            h2d[14] = '\u305a'; // す => ず
+            h2d[15] = '\u305a'; // ず => ず
+            h2d[16] = '\u305c'; // せ => ぜ
+            h2d[17] = '\u305c'; // ぜ => ぜ
+            h2d[18] = '\u305e'; // そ => ぞ
+            h2d[19] = '\u305e'; // ぞ => ぞ
+            h2d[20] = '\u3060'; // た => だ
+            h2d[21] = '\u3060'; // だ => だ
+            h2d[22] = '\u3062'; // ち => ぢ
+            h2d[23] = '\u3062'; // ぢ => ぢ
+            h2d[24] = '\u3063'; // っ (maps to itself)
+            h2d[25] = '\u3065'; // つ => づ
+            h2d[26] = '\u3065'; // づ => づ
+            h2d[27] = '\u3067'; // て => で
+            h2d[28] = '\u3067'; // で => で
+            h2d[29] = '\u3069'; // と => ど
+            h2d[30] = '\u3069'; // ど => ど
+            h2d[31] = '\u306a'; // な (maps to itself)
+            h2d[32] = '\u306b'; // に (maps to itself)
+            h2d[33] = '\u306c'; // ぬ (maps to itself)
+            h2d[34] = '\u306d'; // ね (maps to itself)
+            h2d[35] = '\u306e'; // の (maps to itself)
+            h2d[36] = '\u3070'; // は => ば
+            h2d[37] = '\u3070'; // ば => ば
+            h2d[38] = '\u3071'; // ぱ (maps to itself)
+            h2d[39] = '\u3073'; // ひ => び
+            h2d[40] = '\u3073'; // び => び
+            h2d[41] = '\u3074'; // ぴ (maps to itself)
+            h2d[42] = '\u3076'; // ふ => ぶ
+            h2d[43] = '\u3076'; // ぶ => ぶ
+            h2d[44] = '\u3077'; // ぷ (maps to itself)
+            h2d[45] = '\u3079'; // へ => べ
+            h2d[46] = '\u3079'; // べ => べ
+            h2d[47] = '\u307a'; // ぺ (maps to itself)
+            h2d[48] = '\u307c'; // ほ => ぼ
+            h2d[49] = '\u307c'; // ぼ => ぼ
+
+            // The katakana map is the hiragana map shifted by the distance between カ and か
+            char kanaShift = (char)('\u30ab' - '\u304b'); // カ - か
+            Debug.Assert(h2d.Length == k2d.Length);
+            int slot = 0;
+            while (slot < k2d.Length)
+            {
+                k2d[slot] = (char)(h2d[slot] + kanaShift);
+                slot++;
+            }
+        }
+
+        /// <summary>
+        /// Creates a filter over <paramref name="input"/> using
+        /// <see cref="NORMALIZE_KANJI_DEFAULT"/> and <see cref="NORMALIZE_KANA_DEFAULT"/>.
+        /// </summary>
+        /// <param name="input">Char stream.</param>
+        public JapaneseIterationMarkCharFilter(TextReader input)
+            : this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT)
+        {
+        }
+
+        /// <summary>
+        /// Creates a filter over <paramref name="input"/> with explicit normalization settings.
+        /// </summary>
+        /// <param name="input">Char stream.</param>
+        /// <param name="normalizeKanji">Indicates whether kanji iteration marks should be normalized.</param>
+        /// <param name="normalizeKana">Indicates whether kana iteration marks should be normalized.</param>
+        public JapaneseIterationMarkCharFilter(TextReader input, bool normalizeKanji, bool normalizeKana)
+            : base(input)
+        {
+            // Point the rolling buffer at the reader, then record the settings
+            buffer.Reset(input);
+            this.normalizeKana = normalizeKana;
+            this.normalizeKanji = normalizeKanji;
+        }
+
+        /// <summary>
+        /// Reads up to <paramref name="length"/> normalized characters into
+        /// <paramref name="buffer"/>, starting at <paramref name="offset"/>, by calling
+        /// <see cref="Read()"/> once per character.
+        /// </summary>
+        /// <param name="buffer">Destination array; positions <paramref name="offset"/> onwards receive the characters read.</param>
+        /// <param name="offset">The position in <paramref name="buffer"/> at which to begin writing.</param>
+        /// <param name="length">The maximum number of characters to read.</param>
+        /// <returns>
+        /// The number of characters written, or -1 if no character was written
+        /// (end of input reached immediately, or <paramref name="length"/> is not positive).
+        /// </returns>
+        public override int Read(char[] buffer, int offset, int length)
+        {
+            int end = offset + length;
+            int position = offset;
+
+            while (position < end)
+            {
+                int next = Read();
+                if (next == -1)
+                {
+                    break;
+                }
+                buffer[position] = (char)next;
+                position++;
+            }
+
+            int count = position - offset;
+            if (count == 0)
+            {
+                return -1;
+            }
+            return count;
+        }
+
+        /// <summary>
+        /// Returns the next character of the input, with an iteration mark replaced by the
+        /// character it repeats, and advances the buffer position by one.
+        /// </summary>
+        /// <returns>The next (possibly normalized) character, or -1 if no more characters are available.</returns>
+        public override int Read()
+        {
+            int next = buffer.Get(bufferPosition);
+
+            if (next == -1)
+            {
+                // End of input: release the buffered characters and report EOF
+                buffer.FreeBefore(bufferPosition);
+                return next;
+            }
+
+            char current = (char)next;
+            bool isFullStop = current == FULL_STOP_PUNCTUATION;
+
+            // A full stop is the point at which the rolling buffer is freed
+            if (isFullStop)
+            {
+                buffer.FreeBefore(bufferPosition);
+            }
+
+            // Neither surrogates nor a full stop may serve as the source of an iteration
+            // mark, so move the span end position just past them
+            if (isFullStop || char.IsSurrogate(current))
+            {
+                iterationMarkSpanEndPosition = bufferPosition + 1;
+            }
+
+            char result = IsIterationMark(current) ? NormalizeIterationMark(current) : current;
+
+            bufferPosition++;
+            return result;
+        }
+
+        /// <summary>
+        /// Resolves the iteration mark <paramref name="c"/> found at the current buffer position.
+        /// </summary>
+        /// <param name="c">Iteration mark character to normalize.</param>
+        /// <returns>
+        /// The normalized character, or <paramref name="c"/> itself when the mark starts
+        /// exactly where the previous span ended (which is illegal).
+        /// </returns>
+        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+        private char NormalizeIterationMark(char c)
+        {
+            if (bufferPosition == iterationMarkSpanEndPosition)
+            {
+                // Illegal: a new span would start right where the previous one ended.
+                // Emit the mark unchanged and push the end position forward so the next
+                // position cannot start a span either.
+                iterationMarkSpanEndPosition++;
+                return c;
+            }
+
+            if (bufferPosition > iterationMarkSpanEndPosition)
+            {
+                // A new span starts here: measure it and remember where it ends
+                iterationMarksSpanSize = NextIterationMarkSpanSize();
+                iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
+            }
+
+            // Either inside the existing span or at the start of the one just measured
+            char source = SourceCharacter(bufferPosition, iterationMarksSpanSize);
+            return Normalize(source, c);
+        }
+
+        /// <summary>
+        /// Counts the consecutive iteration marks starting at the current buffer position,
+        /// limited so that the span does not reach back before the previous span end position.
+        /// </summary>
+        /// <returns>Size of the iteration mark span starting at the current buffer position.</returns>
+        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+        private int NextIterationMarkSpanSize()
+        {
+            int markCount = 0;
+            while (true)
+            {
+                int next = buffer.Get(bufferPosition + markCount);
+                if (next == -1 || !IsIterationMark((char)next))
+                {
+                    break;
+                }
+                markCount++;
+            }
+
+            // Only the characters after the previous end position may be used as sources
+            int available = bufferPosition - iterationMarkSpanEndPosition;
+            if (markCount > available)
+            {
+                markCount = available;
+            }
+            return markCount;
+        }
+
+        /// <summary>
+        /// Fetches the character located <paramref name="spanSize"/> positions before
+        /// <paramref name="position"/> in the rolling buffer.
+        /// </summary>
+        /// <param name="position">Buffer position (should not exceed bufferPosition).</param>
+        /// <param name="spanSize">Iteration mark span size.</param>
+        /// <returns>Source character.</returns>
+        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+        private char SourceCharacter(int position, int spanSize)
+        {
+            int sourcePosition = position - spanSize;
+            return (char)buffer.Get(sourcePosition);
+        }
+
+        /// <summary>
+        /// Normalizes the source character <paramref name="c"/> according to the kind of
+        /// iteration mark <paramref name="m"/> that refers to it.
+        /// </summary>
+        /// <param name="c">Character to normalize.</param>
+        /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
+        /// <returns>Normalized character - <paramref name="c"/> itself when the mark is not a kana mark.</returns>
+        private char Normalize(char c, char m)
+        {
+            // A mark that is not kana is assumed to be the kanji mark, which repeats c unchanged
+            return IsHiraganaIterationMark(m) ? NormalizedHiragana(c, m)
+                : IsKatakanaIterationMark(m) ? NormalizedKatakana(c, m)
+                : c;
+        }
+
+        /// <summary>
+        /// Applies a hiragana iteration mark to <paramref name="c"/>: the voiced mark yields
+        /// the dakuten lookup of <paramref name="c"/>, the plain mark yields <c>c - 1</c>
+        /// when <paramref name="c"/> is reported as dakuten.
+        /// </summary>
+        /// <param name="c">Hiragana character.</param>
+        /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
+        /// <returns>Normalized character - <paramref name="c"/> on illegal iteration marks.</returns>
+        private char NormalizedHiragana(char c, char m)
+        {
+            if (m == HIRAGANA_VOICED_ITERATION_MARK)
+            {
+                return LookupHiraganaDakuten(c);
+            }
+
+            if (m == HIRAGANA_ITERATION_MARK && IsHiraganaDakuten(c))
+            {
+                return (char)(c - 1);
+            }
+
+            return c;
+        }
+
+        /// <summary>
+        /// Applies a katakana iteration mark to <paramref name="c"/>: the voiced mark yields
+        /// the dakuten lookup of <paramref name="c"/>, the plain mark yields <c>c - 1</c>
+        /// when <paramref name="c"/> is reported as dakuten.
+        /// </summary>
+        /// <param name="c">Katakana character.</param>
+        /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
+        /// <returns>Normalized character - <paramref name="c"/> on illegal iteration marks.</returns>
+        private char NormalizedKatakana(char c, char m)
+        {
+            if (m == KATAKANA_VOICED_ITERATION_MARK)
+            {
+                return LookupKatakanaDakuten(c);
+            }
+
+            if (m == KATAKANA_ITERATION_MARK && IsKatakanaDakuten(c))
+            {
+                return (char)(c - 1);
+            }
+
+            return c;
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="c"/> is any iteration mark this filter is
+        /// configured to normalize (kanji, hiragana or katakana).
+        /// </summary>
+        /// <param name="c">Character to test.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is an iteration mark character.  Otherwise <c>false</c>.</returns>
+        private bool IsIterationMark(char c)
+        {
+            if (IsKanjiIterationMark(c))
+            {
+                return true;
+            }
+            if (IsHiraganaIterationMark(c))
+            {
+                return true;
+            }
+            return IsKatakanaIterationMark(c);
+        }
+
+        /// <summary>
+        /// Tests for the plain or voiced hiragana iteration mark. Always <c>false</c>
+        /// when kana normalization is disabled.
+        /// </summary>
+        /// <param name="c">Character to test.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is a hiragana iteration mark character.  Otherwise <c>false</c>.</returns>
+        private bool IsHiraganaIterationMark(char c)
+        {
+            return normalizeKana
+                && (c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK);
+        }
+
+        /// <summary>
+        /// Tests for the plain or voiced katakana iteration mark. Always <c>false</c>
+        /// when kana normalization is disabled.
+        /// </summary>
+        /// <param name="c">Character to test.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is a katakana iteration mark character.  Otherwise <c>false</c>.</returns>
+        private bool IsKatakanaIterationMark(char c)
+        {
+            return normalizeKana
+                && (c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK);
+        }
+
+        /// <summary>
+        /// Tests for the kanji iteration mark. Always <c>false</c> when kanji
+        /// normalization is disabled.
+        /// </summary>
+        /// <param name="c">Character to test.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is a kanji iteration mark character.  Otherwise <c>false</c>.</returns>
+        private bool IsKanjiIterationMark(char c)
+        {
+            return normalizeKanji && c == KANJI_ITERATION_MARK;
+        }
+
+        /// <summary>
+        /// Maps a hiragana character to the entry stored for it in the hiragana dakuten table.
+        /// </summary>
+        /// <param name="c">Character to look up.</param>
+        /// <returns>Table entry for <paramref name="c"/>, or <paramref name="c"/> itself if it lies outside the table.</returns>
+        private char LookupHiraganaDakuten(char c)
+        {
+            const char tableBase = '\u304b'; // か, the first character covered by the table
+            return Lookup(c, h2d, tableBase);
+        }
+
+        /// <summary>
+        /// Maps a katakana character to the entry stored for it in the katakana dakuten table.
+        /// Only full-width katakana are supported.
+        /// </summary>
+        /// <param name="c">Character to look up.</param>
+        /// <returns>Table entry for <paramref name="c"/>, or <paramref name="c"/> itself if it lies outside the table.</returns>
+        private char LookupKatakanaDakuten(char c)
+        {
+            const char tableBase = '\u30ab'; // カ, the first character covered by the table
+            return Lookup(c, k2d, tableBase);
+        }
+
+        /// <summary>
+        /// Hiragana dakuten predicate.
+        /// </summary>
+        /// <remarks>
+        /// A voiced character directly follows its unvoiced base in code point order, and the
+        /// base is mapped to it by the dakuten table. Checking the predecessor (rather than
+        /// whether <paramref name="c"/> maps to itself) keeps characters that merely have no
+        /// voiced form, such as U+306A or U+3063, from being reported as dakuten.
+        /// </remarks>
+        /// <param name="c">Character to check.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is a hiragana dakuten and otherwise <c>false</c>.</returns>
+        private bool IsHiraganaDakuten(char c)
+        {
+            // For the first table character the predecessor lies outside the table and is
+            // returned unchanged by the lookup, so the comparison is false as required.
+            return Inside(c, h2d, '\u304b') && LookupHiraganaDakuten((char)(c - 1)) == c;
+        }
+
+        /// <summary>
+        /// Katakana dakuten predicate.
+        /// </summary>
+        /// <remarks>
+        /// A voiced character directly follows its unvoiced base in code point order, and the
+        /// base is mapped to it by the dakuten table. Checking the predecessor (rather than
+        /// whether <paramref name="c"/> maps to itself) keeps characters that merely have no
+        /// voiced form, such as U+30CA or U+30C3, from being reported as dakuten.
+        /// </remarks>
+        /// <param name="c">Character to check.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is a katakana dakuten and otherwise <c>false</c>.</returns>
+        private bool IsKatakanaDakuten(char c)
+        {
+            // For the first table character the predecessor lies outside the table and is
+            // returned unchanged by the lookup, so the comparison is false as required.
+            return Inside(c, k2d, '\u30ab') && LookupKatakanaDakuten((char)(c - 1)) == c;
+        }
+
+        /// <summary>
+        /// Returns the entry of <paramref name="map"/> at index <c>c - offset</c>, or
+        /// <paramref name="c"/> unchanged when that index falls outside the map.
+        /// </summary>
+        /// <param name="c">Character to look up.</param>
+        /// <param name="map">Dakuten map.</param>
+        /// <param name="offset">Code point of the character stored at index 0 of <paramref name="map"/>.</param>
+        /// <returns>Mapped character or <paramref name="c"/> if no mapping exists.</returns>
+        private char Lookup(char c, char[] map, char offset)
+        {
+            return Inside(c, map, offset) ? map[c - offset] : c;
+        }
+
+        /// <summary>
+        /// Tests whether <c>c - offset</c> is a valid index into <paramref name="map"/>.
+        /// </summary>
+        /// <param name="c">Character to look up.</param>
+        /// <param name="map">Dakuten map.</param>
+        /// <param name="offset">Code point of the character stored at index 0 of <paramref name="map"/>.</param>
+        /// <returns><c>true</c> if <paramref name="c"/> is mapped by map and otherwise <c>false</c>.</returns>
+        private bool Inside(char c, char[] map, char offset)
+        {
+            int index = c - offset;
+            return index >= 0 && index < map.Length;
+        }
+
+        /// <summary>
+        /// Returns <paramref name="currentOff"/> unchanged.
+        /// </summary>
+        /// <param name="currentOff">Offset in the filtered output.</param>
+        /// <returns><paramref name="currentOff"/>.</returns>
+        protected override int Correct(int currentOff)
+        {
+            // Every input character yields exactly one output character, so string
+            // lengths (and therefore offsets) are not changed by this filter
+            return currentOff;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs 
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs
new file mode 100644
index 0000000..c9518c9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs
@@ -0,0 +1,66 @@
+ï»¿using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapaneseIterationMarkCharFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="false"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;charFilter class="solr.JapaneseIterationMarkCharFilterFactory" normalizeKanji="true" normalizeKana="true"/&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapaneseIterationMarkCharFilterFactory : CharFilterFactory, IMultiTermAwareComponent
+    {
+        // Names of the recognised factory arguments
+        private const string NORMALIZE_KANJI_PARAM = "normalizeKanji";
+        private const string NORMALIZE_KANA_PARAM = "normalizeKana";
+
+        private readonly bool normalizeKanji;
+        private readonly bool normalizeKana;
+
+        /// <summary>
+        /// Creates a new <see cref="JapaneseIterationMarkCharFilterFactory"/>.
+        /// </summary>
+        /// <param name="args">
+        /// Factory arguments. <c>normalizeKanji</c> and <c>normalizeKana</c> are read with
+        /// the defaults declared on <see cref="JapaneseIterationMarkCharFilter"/>; any entry
+        /// still present afterwards is treated as unknown.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// If <paramref name="args"/> is not empty after the known arguments have been read.
+        /// </exception>
+        public JapaneseIterationMarkCharFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            normalizeKanji = GetBoolean(args, NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
+            normalizeKana = GetBoolean(args, NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name,
+                // so name the offending keys explicitly.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="JapaneseIterationMarkCharFilter"/>
+        /// configured with this factory's settings.
+        /// </summary>
+        /// <param name="input">Reader to wrap.</param>
+        /// <returns>The wrapping <see cref="JapaneseIterationMarkCharFilter"/>.</returns>
+        public override TextReader Create(TextReader input)
+        {
+            return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana);
+        }
+
+        /// <summary>
+        /// Returns this factory itself as the multi-term component.
+        /// </summary>
+        public virtual AbstractAnalysisFactory GetMultiTermComponent()
+        {
+            return this;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs
new file mode 100644
index 0000000..857e5bf
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs
@@ -0,0 +1,111 @@
+ï»¿using Lucene.Net.Analysis.TokenAttributes;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that normalizes common katakana spelling 
variations
+    /// ending in a long sound character by removing this character (U+30FC).  
Only
+    /// katakana words longer than a minimum length are stemmed (default is 
four).
+    /// </summary>
+    /// <remarks>
+    /// Note that only full-width katakana characters are supported.  Please 
use a
+    /// <see cref="Cjk.CJKWidthFilter"/> to convert half-width
+    /// katakana to full-width before using this filter.
+    /// <para/>
+    /// In order to prevent terms from being stemmed, use an instance of
+    /// <see cref="Miscellaneous.SetKeywordMarkerFilter"/>
+    /// or a custom <see cref="TokenFilter"/> that sets the <see 
cref="IKeywordAttribute"/>
+    /// before this <see cref="TokenStream"/>.
+    /// </remarks>
+    public sealed class JapaneseKatakanaStemFilter : TokenFilter
+    {
+        /// <summary>Minimum term length used by the single-argument constructor.</summary>
+        public static readonly int DEFAULT_MINIMUM_LENGTH = 4;
+
+        private const char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';
+
+        // Bounds of the Unicode "Katakana" block (the range the \p{IsKatakana} regex class covers).
+        // Half-width katakana (U+FF65..U+FF9F) lie outside it.
+        private const char KATAKANA_BLOCK_START = '\u30a0';
+        private const char KATAKANA_BLOCK_END = '\u30ff';
+
+        private readonly ICharTermAttribute termAttr;
+        private readonly IKeywordAttribute keywordAttr;
+        private readonly int minimumKatakanaLength;
+
+        /// <summary>
+        /// Creates a stem filter over <paramref name="input"/>.
+        /// </summary>
+        /// <param name="input">Token stream to filter.</param>
+        /// <param name="minimumLength">Terms shorter than this are never stemmed.</param>
+        public JapaneseKatakanaStemFilter(TokenStream input, int minimumLength)
+            : base(input)
+        {
+            this.minimumKatakanaLength = minimumLength;
+            this.termAttr = AddAttribute<ICharTermAttribute>();
+            this.keywordAttr = AddAttribute<IKeywordAttribute>();
+        }
+
+        /// <summary>
+        /// Creates a stem filter over <paramref name="input"/> using <see cref="DEFAULT_MINIMUM_LENGTH"/>.
+        /// </summary>
+        /// <param name="input">Token stream to filter.</param>
+        public JapaneseKatakanaStemFilter(TokenStream input)
+            : this(input, DEFAULT_MINIMUM_LENGTH)
+        {
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and, unless the token is flagged as a keyword,
+        /// shortens the term to its stemmed length.
+        /// </summary>
+        /// <returns><c>true</c> if a token was produced; <c>false</c> at the end of the stream.</returns>
+        public override bool IncrementToken()
+        {
+            if (m_input.IncrementToken())
+            {
+                if (!keywordAttr.IsKeyword)
+                {
+                    termAttr.SetLength(Stem(termAttr.Buffer, termAttr.Length));
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        /// <summary>
+        /// Computes the stemmed length of a term: one less than <paramref name="length"/> when
+        /// the term is long enough, all katakana, and ends in the prolonged sound mark (U+30FC).
+        /// </summary>
+        /// <param name="term">Term buffer.</param>
+        /// <param name="length">Number of valid characters in <paramref name="term"/>.</param>
+        /// <returns>The new term length.</returns>
+        private int Stem(char[] term, int length)
+        {
+            // The length <= 0 guard protects term[length - 1] below when the configured
+            // minimum length is zero or negative and an empty term arrives.
+            if (length <= 0 || length < minimumKatakanaLength)
+            {
+                return length;
+            }
+
+            if (!IsKatakana(term, length))
+            {
+                return length;
+            }
+
+            if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK)
+            {
+                return length - 1;
+            }
+
+            return length;
+        }
+
+        /// <summary>
+        /// Tests whether the first <paramref name="length"/> characters of
+        /// <paramref name="term"/> all lie in the Unicode Katakana block.
+        /// </summary>
+        /// <param name="term">Term buffer.</param>
+        /// <param name="length">Number of characters to test.</param>
+        /// <returns><c>true</c> if every tested character is full-width katakana.</returns>
+        private bool IsKatakana(char[] term, int length)
+        {
+            for (int i = 0; i < length; i++)
+            {
+                // NOTE: Test only identifies full-width characters -- half-widths are not supported.
+                // A plain range check avoids allocating a string and running a regex per character.
+                char c = term[i];
+                if (c < KATAKANA_BLOCK_START || c > KATAKANA_BLOCK_END)
+                {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs 
b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs
new file mode 100644
index 0000000..af2acb5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs
@@ -0,0 +1,61 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapaneseKatakanaStemFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapaneseKatakanaStemFilterFactory"
+    ///             minimumLength="4"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapaneseKatakanaStemFilterFactory : TokenFilterFactory
+    {
+        /// <summary>Name of the factory argument that carries the minimum length.</summary>
+        private const string MINIMUM_LENGTH_PARAM = "minimumLength";
+
+        /// <summary>Minimum length passed to every <see cref="JapaneseKatakanaStemFilter"/> created by this factory.</summary>
+        private readonly int minimumLength;
+
+        /// <summary>Creates a new <see cref="JapaneseKatakanaStemFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. <c>minimumLength</c> is read with
+        /// <see cref="JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH"/> as the fallback;
+        /// any entries still present afterwards are rejected as unknown.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// <c>minimumLength</c> is less than 2, or <paramref name="args"/> still contains
+        /// entries after the recognized arguments have been read.
+        /// </exception>
+        public JapaneseKatakanaStemFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            minimumLength = GetInt32(args, MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
+            if (minimumLength < 2)
+            {
+                throw new ArgumentException("Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
+            }
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name
+                // (the default Object.ToString()), so list the leftover keys instead.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="JapaneseKatakanaStemFilter"/>
+        /// configured with this factory's minimum length.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to wrap.</param>
+        /// <returns>The newly created filter.</returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new JapaneseKatakanaStemFilter(input, minimumLength);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs
new file mode 100644
index 0000000..2b1ccc4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs
@@ -0,0 +1,61 @@
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Removes tokens that match a set of part-of-speech tags.
+    /// </summary>
+    public sealed class JapanesePartOfSpeechStopFilter : FilteringTokenFilter
+    {
+        // Part-of-speech tags whose tokens are rejected by Accept().
+        private readonly ISet<string> stopTags;
+
+        // Source of the current token's part-of-speech tag.
+        private readonly IPartOfSpeechAttribute posAtt;
+
+        /// <summary>
+        /// Create a new <see cref="JapanesePartOfSpeechStopFilter"/> with an explicit
+        /// position-increment setting, which is forwarded to the base class.
+        /// </summary>
+        /// <param name="version">The Lucene match version.</param>
+        /// <param name="enablePositionIncrements">Value forwarded to the base constructor.</param>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        /// <param name="stopTags">The part-of-speech tags that should be removed.</param>
+        [Obsolete("EnablePositionIncrements=false is not supported anymore as of Lucene 4.4.")]
+        public JapanesePartOfSpeechStopFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream input, ISet<string> stopTags)
+            : base(version, enablePositionIncrements, input)
+        {
+            posAtt = AddAttribute<IPartOfSpeechAttribute>();
+            this.stopTags = stopTags;
+        }
+
+        /// <summary>
+        /// Create a new <see cref="JapanesePartOfSpeechStopFilter"/>.
+        /// </summary>
+        /// <param name="version">The Lucene match version.</param>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        /// <param name="stopTags">The part-of-speech tags that should be removed.</param>
+        public JapanesePartOfSpeechStopFilter(LuceneVersion version, TokenStream input, ISet<string> stopTags)
+            : base(version, input)
+        {
+            posAtt = AddAttribute<IPartOfSpeechAttribute>();
+            this.stopTags = stopTags;
+        }
+
+        /// <summary>
+        /// Decides whether the current token is kept.
+        /// </summary>
+        /// <returns>
+        /// <c>true</c> when the token has no part-of-speech tag or its tag is not in the
+        /// stop-tag set; <c>false</c> when the tag is in the set.
+        /// </returns>
+        protected override bool Accept()
+        {
+            string tag = posAtt.GetPartOfSpeech();
+            if (tag == null)
+            {
+                // Untagged tokens are always kept.
+                return true;
+            }
+
+            return !stopTags.Contains(tag);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs
new file mode 100644
index 0000000..04fc900
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs
@@ -0,0 +1,85 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapanesePartOfSpeechStopFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapanesePartOfSpeechStopFilterFactory"
+    ///             tags="stopTags.txt" 
+    ///             enablePositionIncrements="true"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapanesePartOfSpeechStopFilterFactory : TokenFilterFactory, IResourceLoaderAware
+    {
+        // Value of the "tags" argument; handed to GetWordSet() in Inform().
+        private readonly string stopTagFiles;
+
+        // Value of the "enablePositionIncrements" argument (true when absent).
+        private readonly bool enablePositionIncrements;
+
+        // Populated by Inform(); stays null when no word set was loaded.
+        private ISet<string> stopTags;
+
+        /// <summary>Creates a new <see cref="JapanesePartOfSpeechStopFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. <c>tags</c> and <c>enablePositionIncrements</c> are read;
+        /// any entries still present afterwards are rejected as unknown.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="args"/> still contains entries after the recognized arguments
+        /// have been read.
+        /// </exception>
+        public JapanesePartOfSpeechStopFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            stopTagFiles = Get(args, "tags");
+            enablePositionIncrements = GetBoolean(args, "enablePositionIncrements", true);
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name
+                // (the default Object.ToString()), so list the leftover keys instead.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Loads the stop tags through <c>GetWordSet</c> and copies them into a
+        /// <see cref="HashSet{T}"/>. When <c>GetWordSet</c> returns <c>null</c> the
+        /// stop-tag set is left <c>null</c>.
+        /// </summary>
+        /// <param name="loader">Resource loader passed to <c>GetWordSet</c>.</param>
+        public virtual void Inform(IResourceLoader loader)
+        {
+            stopTags = null;
+            CharArraySet cas = GetWordSet(loader, stopTagFiles, false);
+            if (cas != null)
+            {
+                stopTags = new HashSet<string>();
+                foreach (string element in cas)
+                {
+                    stopTags.Add(element);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="stream"/> in a <see cref="JapanesePartOfSpeechStopFilter"/>
+        /// when stop tags have been loaded; otherwise returns <paramref name="stream"/> unchanged.
+        /// </summary>
+        /// <param name="stream">The <see cref="TokenStream"/> to wrap.</param>
+        /// <returns>The filter, or <paramref name="stream"/> itself when there are no stop tags.</returns>
+        public override TokenStream Create(TokenStream stream)
+        {
+            // if stopTags is null, it means the file is empty
+            if (stopTags == null)
+            {
+                return stream;
+            }
+
+            // The obsolete constructor is used deliberately so the configured
+            // enablePositionIncrements value is honored.
+#pragma warning disable 612, 618
+            return new JapanesePartOfSpeechStopFilter(m_luceneMatchVersion, enablePositionIncrements, stream, stopTags);
+#pragma warning restore 612, 618
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
new file mode 100644
index 0000000..b2e1542
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
@@ -0,0 +1,89 @@
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Analysis.TokenAttributes;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that replaces the term
+    /// attribute with the reading of a token in either katakana or romaji 
form.
+    /// The default reading form is katakana.
+    /// </summary>
+    public sealed class JapaneseReadingFormFilter : TokenFilter
+    {
+        private readonly ICharTermAttribute termAttr;
+        private readonly IReadingAttribute readingAttr;
+
+        // Scratch buffer that receives the romanization; cleared before each use.
+        private StringBuilder buffer = new StringBuilder();
+
+        // When true the term is replaced by a romanization, otherwise by the reading itself.
+        private bool useRomaji;
+
+        /// <summary>
+        /// Creates a new <see cref="JapaneseReadingFormFilter"/>.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        /// <param name="useRomaji">
+        /// <c>true</c> to replace each term with the romanization of its reading;
+        /// <c>false</c> to replace it with the reading as returned by the reading attribute.
+        /// </param>
+        public JapaneseReadingFormFilter(TokenStream input, bool useRomaji)
+            : base(input)
+        {
+            termAttr = AddAttribute<ICharTermAttribute>();
+            readingAttr = AddAttribute<IReadingAttribute>();
+            this.useRomaji = useRomaji;
+        }
+
+        /// <summary>
+        /// Creates a new <see cref="JapaneseReadingFormFilter"/> with romaji output disabled.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        public JapaneseReadingFormFilter(TokenStream input)
+            : this(input, false)
+        {
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream by one token and rewrites its term from the token's reading.
+        /// </summary>
+        /// <returns>
+        /// <c>true</c> when the wrapped stream produced a token; <c>false</c> when it did not.
+        /// </returns>
+        public override bool IncrementToken()
+        {
+            if (!m_input.IncrementToken())
+            {
+                return false;
+            }
+
+            string reading = readingAttr.GetReading();
+
+            if (!useRomaji)
+            {
+                // just replace the term text with the reading, if it exists
+                if (reading != null)
+                {
+                    termAttr.SetEmpty().Append(reading);
+                }
+                return true;
+            }
+
+            // if it's an OOV term (no reading), just try the term text
+            string romanizationSource = reading ?? termAttr.ToString();
+            buffer.Length = 0;
+            ToStringUtil.GetRomanization(buffer, romanizationSource);
+            termAttr.SetEmpty().Append(buffer);
+            return true;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
new file mode 100644
index 0000000..9464c2e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
@@ -0,0 +1,57 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="JapaneseReadingFormFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapaneseReadingFormFilterFactory"
+    ///             useRomaji="false"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapaneseReadingFormFilterFactory : TokenFilterFactory
+    {
+        /// <summary>Name of the factory argument that switches romaji output on or off.</summary>
+        private const string ROMAJI_PARAM = "useRomaji";
+
+        /// <summary>Flag passed to every <see cref="JapaneseReadingFormFilter"/> created by this factory.</summary>
+        private readonly bool useRomaji;
+
+        /// <summary>Creates a new <see cref="JapaneseReadingFormFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. <c>useRomaji</c> is read with <c>false</c> as the fallback;
+        /// any entries still present afterwards are rejected as unknown.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="args"/> still contains entries after the recognized argument
+        /// has been read.
+        /// </exception>
+        public JapaneseReadingFormFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            useRomaji = GetBoolean(args, ROMAJI_PARAM, false);
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name
+                // (the default Object.ToString()), so list the leftover keys instead.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="JapaneseReadingFormFilter"/>
+        /// configured with this factory's romaji flag.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to wrap.</param>
+        /// <returns>The newly created filter.</returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new JapaneseReadingFormFilter(input, useRomaji);
+        }
+    }
+}

[10/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

Reply via email to