[04/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

nightowl888 Sun, 23 Jul 2017 10:36:50 -0700

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
new file mode 100644
index 0000000..609803f
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
@@ -0,0 +1,84 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class TestJapaneseBaseFormFilter : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Analyzer shared by the tests: a <see cref="JapaneseTokenizer"/> created with
+        /// <c>JapaneseTokenizer.DEFAULT_MODE</c>, wrapped in a <see cref="JapaneseBaseFormFilter"/>.
+        /// </summary>
+        private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+            TokenStream filtered = new JapaneseBaseFormFilter(source);
+            return new TokenStreamComponents(source, filtered);
+        });
+
+
+        /// <summary>
+        /// Analyzes a sample sentence with the shared analyzer and checks the emitted terms;
+        /// the "あり" in the input is expected to be reported as "ある".
+        /// </summary>
+        [Test]
+        public void TestBasics()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            AssertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
+                new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" });
+        }
+
+        /// <summary>
+        /// Same sentence as TestBasics, but "あり" is placed in the exclusion set of a
+        /// <see cref="SetKeywordMarkerFilter"/> ahead of the base-form filter, and the
+        /// expected output keeps "あり" unchanged.
+        /// </summary>
+        [Test]
+        public void TestKeyword()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("あり"), false);
+            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+                TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
+                return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
+            });
+
+            AssertAnalyzesTo(a, "それはまだ実験段階にあります",
+                new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" });
+        }
+
+        /// <summary>
+        /// Checks that the input "this atest" is analyzed to the two terms "this" and "atest".
+        /// </summary>
+        [Test]
+        public void TestEnglish()
+        {
+            String[] expectedTerms = new String[] { "this", "atest" };
+            AssertAnalyzesTo(analyzer, "this atest", expectedTerms);
+        }
+
+        /// <summary>
+        /// Passes the shared analyzer to CheckRandomData, with the iteration argument
+        /// taken from AtLeast(1000).
+        /// </summary>
+        [Test]
+        public void TestRandomStrings()
+        {
+            // Random() is obtained before AtLeast() to keep the original call order.
+            var random = Random();
+            int iterations = AtLeast(1000);
+            CheckRandomData(random, analyzer, iterations);
+        }
+
+        /// <summary>
+        /// Builds an analyzer from a <see cref="KeywordTokenizer"/> followed by a
+        /// <see cref="JapaneseBaseFormFilter"/> and calls CheckOneTerm with the empty
+        /// string as both input and expected term.
+        /// </summary>
+        [Test]
+        public void TestEmptyTerm()
+        {
+            Analyzer keywordOnly = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new KeywordTokenizer(reader);
+                TokenStream filtered = new JapaneseBaseFormFilter(source);
+                return new TokenStreamComponents(source, filtered);
+            });
+
+            CheckOneTerm(keywordOnly, "", "");
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
new file mode 100644
index 0000000..61a8b2e
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
@@ -0,0 +1,60 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for <see cref="JapaneseBaseFormFilterFactory"/>
+    /// </summary>
+    public class TestJapaneseBaseFormFilterFactory : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Creates a tokenizer and a base-form filter through their factories (both with empty
+        /// argument maps) and checks the terms produced for a sample sentence.
+        /// </summary>
+        [Test]
+        public void TestBasics()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream ts = tokenizerFactory.Create(new StringReader("それはまだ実験段階にあります"));
+            JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(new Dictionary<String, String>());
+            ts = factory.Create(ts);
+            AssertTokenStreamContents(ts,
+                new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" });
+        }
+
+        /// <summary>
+        /// Constructing the factory with an unrecognized argument must throw an
+        /// <see cref="ArgumentException"/> whose message contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            Dictionary<String, String> bogusArgs = new Dictionary<String, String>();
+            bogusArgs.Add("bogusArg", "bogusValue");
+
+            try
+            {
+                new JapaneseBaseFormFilterFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the bogus argument.
+                fail();
+            }
+            catch (ArgumentException e)
+            {
+                assertTrue(e.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
new file mode 100644
index 0000000..9db0903
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
@@ -0,0 +1,241 @@
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class TestJapaneseIterationMarkCharFilter : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Analyzer whose reader is wrapped in a <see cref="JapaneseIterationMarkCharFilter"/>
+        /// and whose token stream is a <see cref="MockTokenizer"/> built with
+        /// <c>MockTokenizer.KEYWORD</c>.
+        /// </summary>
+        private Analyzer keywordAnalyzer = Analyzer.NewAnonymous(
+            createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+                return new TokenStreamComponents(source, source);
+            },
+            initReader: (fieldName, reader) => new JapaneseIterationMarkCharFilter(reader));
+
+
+        /// <summary>
+        /// Analyzer whose reader is wrapped in a <see cref="JapaneseIterationMarkCharFilter"/>
+        /// and whose token stream is a <see cref="JapaneseTokenizer"/> created with
+        /// <c>JapaneseTokenizerMode.SEARCH</c>.
+        /// </summary>
+        private Analyzer japaneseAnalyzer = Analyzer.NewAnonymous(
+            createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new JapaneseTokenizer(reader, null, false, JapaneseTokenizerMode.SEARCH);
+                return new TokenStreamComponents(source, source);
+            },
+            initReader: (fieldName, reader) => new JapaneseIterationMarkCharFilter(reader));
+
+        /// <summary>
+        /// Checks that the kanji iteration mark "々" is replaced by the character(s) it repeats,
+        /// with both the keyword analyzer and the Japanese analyzer.
+        /// </summary>
+        [Test]
+        public void TestKanji()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+
+            // Test single repetition
+            AssertAnalyzesTo(keywordAnalyzer, "時々", new String[] { "時時" });
+            AssertAnalyzesTo(japaneseAnalyzer, "時々", new String[] { "時時" });
+
+            // Test multiple repetitions
+            AssertAnalyzesTo(keywordAnalyzer, "馬鹿々々しい", new String[] { "馬鹿馬鹿しい" });
+            AssertAnalyzesTo(japaneseAnalyzer, "馬鹿々々しい", new String[] { "馬鹿馬鹿しい" });
+        }
+
+        /// <summary>
+        /// Checks that the voiced katakana iteration mark "ヾ" after "ス" yields "ズ".
+        /// </summary>
+        [Test]
+        public void TestKatakana()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+
+            // Test single repetition
+            AssertAnalyzesTo(keywordAnalyzer, "ミスヾ", new String[] { "ミスズ" });
+            AssertAnalyzesTo(japaneseAnalyzer, "ミスヾ", new String[] { "ミ", "スズ" }); // Side effect
+        }
+
+        [Test]
+        public void testHiragana()
+        {
+            // Test single unvoiced iteration
+            AssertAnalyzesTo(keywordAnalyzer, "ããã®", new String[] { 
"ããã®" });
+            AssertAnalyzesTo(japaneseAnalyzer, "ããã®", new String[] { 
"ã", "ãã®" }); // Side effect
+
+            // Test single voiced iteration
+            AssertAnalyzesTo(keywordAnalyzer, "ã¿ãã", new String[] { 
"ã¿ãã" });
+            AssertAnalyzesTo(japaneseAnalyzer, "ã¿ãã", new String[] { 
"ã¿ãã" });
+
+            // Test single voiced iteration
+            AssertAnalyzesTo(keywordAnalyzer, "ãã", new String[] { 
"ãã" });
+            AssertAnalyzesTo(japaneseAnalyzer, "ãã", new String[] { 
"ãã" });
+
+            // Test single unvoiced iteration with voiced iteration
+            AssertAnalyzesTo(keywordAnalyzer, "ãã", new String[] { 
"ãã" });
+            AssertAnalyzesTo(japaneseAnalyzer, "ãã", new String[] { 
"ãã" });
+
+            // Test multiple repetitions with voiced iteration
+            AssertAnalyzesTo(keywordAnalyzer, "ã¨ããããã", new 
String[] { "ã¨ããã©ãã" });
+            AssertAnalyzesTo(japaneseAnalyzer, "ã¨ããããã", new 
String[] { "ã¨ããã©ãã" });
+        }
+
+        [Test]
+        public void TestMalformed()
+        {
+            // We can't iterate c here, so emit as it is
+            AssertAnalyzesTo(keywordAnalyzer, "abcã¨ãããããã", new 
String[] { "abcã¨ããcã¨ãã" });
+
+            // We can't iterate c (with dakuten change) here, so emit it as-is
+            AssertAnalyzesTo(keywordAnalyzer, "abcã¨ãããããã", new 
String[] { "abcã¨ããcã¨ãã" });
+
+            // We can't iterate before beginning of stream, so emit characters 
as-is
+            AssertAnalyzesTo(keywordAnalyzer, "ã¨ãããããããã", 
new String[] { "ã¨ããã©ããããã" });
+
+            // We can't iterate an iteration mark only, so emit as-is
+            AssertAnalyzesTo(keywordAnalyzer, "ã", new String[] { "ã" });
+            AssertAnalyzesTo(keywordAnalyzer, "ã", new String[] { "ã" });
+            AssertAnalyzesTo(keywordAnalyzer, "ãã", new String[] { 
"ãã" });
+
+            // We can't iterate a full stop punctuation mark (because we use 
it as a flush marker)
+            AssertAnalyzesTo(keywordAnalyzer, "ãã", new String[] { 
"ãã" });
+            AssertAnalyzesTo(keywordAnalyzer, "ãããã", new String[] { 
"ãããã" });
+
+            // We can iterate other punctuation marks
+            AssertAnalyzesTo(keywordAnalyzer, "ï¼ã", new String[] { 
"ï¼ï¼" });
+
+            // We can not get a dakuten variant of ã½ -- this is also a 
corner case test for inside()
+            AssertAnalyzesTo(keywordAnalyzer, "ããã½ãã¤ãã´", new 
String[] { "ããã½ã½ã¤ãã´" });
+            AssertAnalyzesTo(keywordAnalyzer, "ããã½ãã¤ãã´", new 
String[] { "ããã½ã½ã¤ãã´" });
+        }
+
+        /// <summary>
+        /// Empty input must produce no tokens with either analyzer.
+        /// </summary>
+        [Test]
+        public void TestEmpty()
+        {
+            // Keyword analyzer first, then the Japanese one, as in the original sequence.
+            foreach (Analyzer current in new Analyzer[] { keywordAnalyzer, japaneseAnalyzer })
+            {
+                AssertAnalyzesTo(current, "", new String[0]);
+            }
+        }
+
+        [Test]
+        public void TestFullStop()
+        {
+            // Test full stops   
+            AssertAnalyzesTo(keywordAnalyzer, "ã", new String[] { "ã" });
+            AssertAnalyzesTo(keywordAnalyzer, "ãã", new String[] { 
"ãã" });
+            AssertAnalyzesTo(keywordAnalyzer, "ããã", new String[] { 
"ããã" });
+        }
+
+        [Test]
+        public void TestKanjiOnly()
+        {
+            // Test kanji only repetition marks
+            CharFilter filter = new JapaneseIterationMarkCharFilter(
+                new StringReader("æã
ãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ããã§ããabcã¨ãããããã"),
+                true, // kanji
+                false // no kana
+            );
+            assertCharFilterEquals(filter, 
"ææãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ããã§ããabcã¨ãããããã");
+        }
+
+        [Test]
+        public void TestKanaOnly()
+        {
+            // Test kana only repetition marks
+            CharFilter filter = new JapaneseIterationMarkCharFilter(
+                new StringReader("æã
ãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ããã§ããabcã¨ãããããã"),
+                false, // no kanji
+                true   // kana
+            );
+            assertCharFilterEquals(filter, "æã
ãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ããã§ããabcã¨ããã©ããã");
+        }
+
+        [Test]
+        public void TestNone()
+        {
+            // Test no repetition marks
+            CharFilter filter = new JapaneseIterationMarkCharFilter(
+                new StringReader("æã
ãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ããã§ããabcã¨ãããããã"),
+                false, // no kanji
+                false  // no kana
+            );
+            assertCharFilterEquals(filter, "æã
ãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ããã§ããabcã¨ãããããã");
+        }
+
+        [Test]
+        public void TestCombinations()
+        {
+            AssertAnalyzesTo(keywordAnalyzer, "æã
ãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ã«è¡ãã¾ãã",
+                new String[] { 
"ææãããã®ããã¨ä¸ç·ã«ãå¯¿å¸ãé£ã¹ã«è¡ãã¾ãã" }
+            );
+        }
+
+        [Test]
+        public void TestHiraganaCoverage()
+        {
+            // Test all hiragana iteration variants
+            String source = 
"ããããããããããããããããããããããããããããããããããããããããããã
 
ãã¡ãã¢ãã¤ãã¥ãã¦ãã§ãã¨ãã©ãã¯ãã°ãã²ãã³ããµãã¶ãã¸ãã¹ãã»ãã¼ã";
+            String target = 
"ããããããããããããããããããããããããããããããããããããããããããã
 
ãã¡ã¡ã¢ã¡ã¤ã¤ã¥ã¤ã¦ã¦ã§ã¦ã¨ã¨ã©ã¨ã¯ã¯ã°ã¯ã²ã²ã³ã²ãµãµã¶ãµã¸ã¸ã¹ã¸ã»ã»ã¼ã»";
+            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+
+            // Test all hiragana iteration variants with dakuten
+            source = 
"ããããããããããããããããããããããããããããããããããããããããããã
 
ãã¡ãã¢ãã¤ãã¥ãã¦ãã§ãã¨ãã©ãã¯ãã°ãã²ãã³ããµãã¶ãã¸ãã¹ãã»ãã¼ã";
+            target = 
"ãããããããããããããããããããããããããããããããããããããããããã
 ã ã 
ã¡ã¢ã¢ã¢ã¤ã¥ã¥ã¥ã¦ã§ã§ã§ã¨ã©ã©ã©ã¯ã°ã°ã°ã²ã³ã³ã³ãµã¶ã¶ã¶ã¸ã¹ã¹ã¹ã»ã¼ã¼ã¼";
+            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+        }
+
+        [Test]
+        public void TestKatakanaCoverage()
+        {
+            // Test all katakana iteration variants
+            String source = 
"ã«ã½ã¬ã½ãã½ã®ã½ã¯ã½ã°ã½ã±ã½ã²ã½ã³ã½ã´ã½ãµã½ã¶ã½ã·ã½ã¸ã½ã¹ã½ãºã½ã»ã½ã¼ã½ã½ã½ã¾ã½ã¿ã½ãã½ãã½ãã½ãã½ã

ã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½ãã½";
+            String target = 
"ã«ã«ã¬ã«ããã®ãã¯ã¯ã°ã¯ã±ã±ã²ã±ã³ã³ã´ã³ãµãµã¶ãµã·ã·ã¸ã·ã¹ã¹ãºã¹ã»ã»ã¼ã»ã½ã½ã¾ã½ã¿ã¿ãã¿ããããããã

ããããããããããããããããããããããããããããã";
+            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+
+            // Test all katakana iteration variants with dakuten
+            source = 
"ã«ã¾ã¬ã¾ãã¾ã®ã¾ã¯ã¾ã°ã¾ã±ã¾ã²ã¾ã³ã¾ã´ã¾ãµã¾ã¶ã¾ã·ã¾ã¸ã¾ã¹ã¾ãºã¾ã»ã¾ã¼ã¾ã½ã¾ã¾ã¾ã¿ã¾ãã¾ãã¾ãã¾ãã¾ã

ã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾ãã¾";
+            target = 
"ã«ã¬ã¬ã¬ãã®ã®ã®ã¯ã°ã°ã°ã±ã²ã²ã²ã³ã´ã´ã´ãµã¶ã¶ã¶ã·ã¸ã¸ã¸ã¹ãºãºãºã»ã¼ã¼ã¼ã½ã¾ã¾ã¾ã¿ããããããããã
ãã
ãããããããããããããããããããããããããããã";
+            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+        }
+
+        /// <summary>
+        /// Passes the keyword analyzer to CheckRandomData with
+        /// 1000 * RANDOM_MULTIPLIER as the iteration argument.
+        /// </summary>
+        [Test]
+        public void TestRandomStrings()
+        {
+            var random = Random();
+            int iterations = 1000 * RANDOM_MULTIPLIER;
+            CheckRandomData(random, keywordAnalyzer, iterations);
+        }
+
+        /// <summary>
+        /// Passes the keyword analyzer to CheckRandomData with 100 * RANDOM_MULTIPLIER
+        /// as the iteration argument and 8192 as the extra size argument.
+        /// </summary>
+        [Test]
+        public void TestRandomHugeStrings()
+        {
+            // presumably the maximum length of each generated string -- confirm against CheckRandomData
+            const int sizeArgument = 8192;
+
+            var random = Random();
+            int iterations = 100 * RANDOM_MULTIPLIER;
+            CheckRandomData(random, keywordAnalyzer, iterations, sizeArgument);
+        }
+
+        /// <summary>
+        /// Reads <paramref name="filter"/> to the end and asserts that the text read
+        /// equals <paramref name="expected"/>.
+        /// </summary>
+        private void assertCharFilterEquals(CharFilter filter, String expected)
+        {
+            assertEquals(expected, readFully(filter));
+        }
+
+        /// <summary>
+        /// Reads <paramref name="stream"/> one character at a time until Read() returns -1
+        /// and returns everything read as a single string.
+        /// </summary>
+        private String readFully(TextReader stream)
+        {
+            StringBuilder collected = new StringBuilder();
+            for (int next = stream.Read(); next != -1; next = stream.Read())
+            {
+                collected.append((char)next);
+            }
+            return collected.toString();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
new file mode 100644
index 0000000..88f71a9
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
@@ -0,0 +1,108 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for <see cref="JapaneseIterationMarkCharFilterFactory"/>
+    /// </summary>
+    public class TestJapaneseIterationMarkCharFilterFactory : 
BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Runs text through a factory-created iteration mark char filter (empty argument map)
+        /// and a keyword <see cref="MockTokenizer"/>, expecting one term with every iteration
+        /// mark expanded.
+        /// </summary>
+        [Test]
+        public void TestIterationMarksWithKeywordTokenizer()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            String text = "時々馬鹿々々しいところゞゝゝミスヾ";
+            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>());
+            TextReader filter = filterFactory.Create(new StringReader(text));
+            TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
+            AssertTokenStreamContents(tokenStream, new String[] { "時時馬鹿馬鹿しいところどころミスズ" });
+        }
+
+        /// <summary>
+        /// Runs text through a factory-created iteration mark char filter (empty argument map)
+        /// and a factory-created Japanese tokenizer, then checks the resulting terms.
+        /// </summary>
+        [Test]
+        public void TestIterationMarksWithJapaneseTokenizer()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>());
+            TextReader filter = filterFactory.Create(
+                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
+            );
+            TokenStream tokenStream = tokenizerFactory.Create(filter);
+            AssertTokenStreamContents(tokenStream, new String[] { "時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ" });
+        }
+
+        /// <summary>
+        /// Configures the char filter factory with normalizeKanji=true and normalizeKana=false;
+        /// the expected terms have the kanji marks expanded and the kana marks left as-is.
+        /// </summary>
+        [Test]
+        public void TestKanjiOnlyIterationMarksWithJapaneseTokenizer()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+            IDictionary<String, String> filterArgs = new Dictionary<String, String>();
+            filterArgs.Put("normalizeKanji", "true");
+            filterArgs.Put("normalizeKana", "false");
+            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
+
+            TextReader filter = filterFactory.Create(
+                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
+            );
+            TokenStream tokenStream = tokenizerFactory.Create(filter);
+            AssertTokenStreamContents(tokenStream, new String[] { "時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ" });
+        }
+
+        /// <summary>
+        /// Configures the char filter factory with normalizeKanji=false and normalizeKana=true;
+        /// the expected terms have the kana marks expanded and the kanji marks left as-is.
+        /// </summary>
+        [Test]
+        public void TestKanaOnlyIterationMarksWithJapaneseTokenizer()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+            IDictionary<String, String> filterArgs = new Dictionary<String, String>();
+            filterArgs.Put("normalizeKanji", "false");
+            filterArgs.Put("normalizeKana", "true");
+            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
+
+            TextReader filter = filterFactory.Create(
+                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
+            );
+            TokenStream tokenStream = tokenizerFactory.Create(filter);
+            AssertTokenStreamContents(tokenStream, new String[] { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" });
+        }
+
+        /// <summary>
+        /// Constructing the factory with an unrecognized argument must throw an
+        /// <see cref="ArgumentException"/> whose message contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            Dictionary<String, String> bogusArgs = new Dictionary<String, String>();
+            bogusArgs.Add("bogusArg", "bogusValue");
+
+            try
+            {
+                new JapaneseIterationMarkCharFilterFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the bogus argument.
+                fail();
+            }
+            catch (ArgumentException e)
+            {
+                assertTrue(e.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
new file mode 100644
index 0000000..cbbc95b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
@@ -0,0 +1,100 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests for <see cref="JapaneseKatakanaStemFilter"/>
+    /// </summary>
+    public class TestJapaneseKatakanaStemFilter : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Analyzer shared by the tests: a whitespace <see cref="MockTokenizer"/> wrapped in a
+        /// <see cref="JapaneseKatakanaStemFilter"/>.
+        /// </summary>
+        private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            // A MockTokenizer is enough here: the filter doesn't really depend on Kuromoji
+            Tokenizer whitespace = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            TokenStream stemmed = new JapaneseKatakanaStemFilter(whitespace);
+            return new TokenStreamComponents(whitespace, stemmed);
+        });
+
+        /**
+         * Test a few common katakana spelling variations.
+         * <p>
+         * English translations are as follows:
+         * <ul>
+         *   <li>copy</li>
+         *   <li>coffee</li>
+         *   <li>taxi</li>
+         *   <li>party</li>
+         *   <li>party (without long sound)</li>
+         *   <li>center</li>
+         * </ul>
+         * Note that we remove a long sound in the case of "coffee" that is 
required.
+         * </p>
+         */
+        [Test]
+        public void TestStemVariants()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test. The offsets below match the
+            // restored word lengths (3, 4, 4, 5, 4, 4 characters, separated by single spaces).
+            AssertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
+                new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
+                new int[] { 0, 4, 9, 14, 20, 25 },
+                new int[] { 3, 8, 13, 19, 24, 29 });
+        }
+
+        /// <summary>
+        /// Puts "コーヒー" in the exclusion set of a <see cref="SetKeywordMarkerFilter"/> ahead of
+        /// the stem filter and expects the term to come back unchanged.
+        /// </summary>
+        [Test]
+        public void TestKeyword()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("コーヒー"), false);
+            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+                TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
+                return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
+            });
+            CheckOneTerm(a, "コーヒー", "コーヒー");
+        }
+
+        /// <summary>
+        /// Half-width katakana input is expected to pass through the analyzer unchanged.
+        /// </summary>
+        [Test]
+        public void TestUnsupportedHalfWidthVariants()
+        {
+            // NOTE(review): half-width katakana literals restored from a mis-decoded
+            // (UTF-8 read as Latin-1) copy -- verify against the upstream Lucene test.
+
+            // The below result is expected since only full-width katakana is supported
+            AssertAnalyzesTo(analyzer, "ﾀｸｼｰ", new String[] { "ﾀｸｼｰ" });
+        }
+
+        /// <summary>
+        /// Passes the shared analyzer to CheckRandomData with
+        /// 1000 * RANDOM_MULTIPLIER as the iteration argument.
+        /// </summary>
+        [Test]
+        public void TestRandomData()
+        {
+            var random = Random();
+            int iterations = 1000 * RANDOM_MULTIPLIER;
+            CheckRandomData(random, analyzer, iterations);
+        }
+
+        /// <summary>
+        /// Builds an analyzer from a <see cref="KeywordTokenizer"/> followed by a
+        /// <see cref="JapaneseKatakanaStemFilter"/> and calls CheckOneTerm with the empty
+        /// string as both input and expected term.
+        /// </summary>
+        [Test]
+        public void TestEmptyTerm()
+        {
+            Analyzer keywordOnly = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new KeywordTokenizer(reader);
+                TokenStream stemmed = new JapaneseKatakanaStemFilter(source);
+                return new TokenStreamComponents(source, stemmed);
+            });
+
+            CheckOneTerm(keywordOnly, "", "");
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
new file mode 100644
index 0000000..49ac181
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
@@ -0,0 +1,62 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for <see cref="JapaneseKatakanaStemFilterFactory"/>
+    /// </summary>
+    public class TestJapaneseKatakanaStemFilterFactory : 
BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Tokenizes two sample sentences with a factory-created Japanese tokenizer, applies a
+        /// factory-created katakana stem filter (empty argument map) and checks the terms.
+        /// </summary>
+        [Test]
+        public void TestKatakanaStemming()
+        {
+            // NOTE(review): Japanese literals restored from a mis-decoded (UTF-8 read as Latin-1)
+            // copy -- verify against the upstream Lucene test.
+            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream tokenStream = tokenizerFactory.Create(
+                new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。")
+            );
+            // The stray empty statement (";;") that followed this declaration has been dropped.
+            JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new Dictionary<String, String>());
+            AssertTokenStreamContents(filterFactory.Create(tokenStream),
+                new String[]{ "明後日", "パーティ", "に", "行く", "予定", "が", "ある",   // パーティー should be stemmed
+                      "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"} // コピー should not be stemmed
+            );
+        }
+
+        /// <summary>
+        /// Constructing the factory with an unrecognized argument must throw an
+        /// <see cref="ArgumentException"/> whose message contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            Dictionary<String, String> bogusArgs = new Dictionary<String, String>();
+            bogusArgs.Add("bogusArg", "bogusValue");
+
+            try
+            {
+                new JapaneseKatakanaStemFilterFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the bogus argument.
+                fail();
+            }
+            catch (ArgumentException e)
+            {
+                assertTrue(e.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
new file mode 100644
index 0000000..617a1b8
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
@@ -0,0 +1,70 @@
+ï»¿using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for <see cref="JapanesePartOfSpeechStopFilterFactory"/>
+    /// </summary>
+    public class TestJapanesePartOfSpeechStopFilterFactory : 
BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestBasics()
+        {
+            String tags =
+                "#  verb-main:\n" +
+                "åè©-èªç«\n";
+
+            JapaneseTokenizerFactory tokenizerFactory = new 
JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream ts = tokenizerFactory.Create(new 
StringReader("ç§ã¯å¶éã¹ãã¼ããè¶ããã"));
+            IDictionary<String, String> args = new Dictionary<String, 
String>();
+            args.Put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
+            args.Put("tags", "stoptags.txt");
+            JapanesePartOfSpeechStopFilterFactory factory = new 
JapanesePartOfSpeechStopFilterFactory(args);
+            factory.Inform(new StringMockResourceLoader(tags));
+            ts = factory.Create(ts);
+            AssertTokenStreamContents(ts,
+                new String[] { "ç§", "ã¯", "å¶é", "ã¹ãã¼ã", "ã" }
+            );
+        }
+
+        /// <summary>
+        /// Verifies that constructing the factory with an argument it does not
+        /// recognize (alongside a valid <c>luceneMatchVersion</c>) throws an
+        /// <see cref="ArgumentException"/> whose message contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            var bogusArgs = new Dictionary<String, String>
+            {
+                { "luceneMatchVersion", TEST_VERSION_CURRENT.toString() },
+                { "bogusArg", "bogusValue" }
+            };
+
+            try
+            {
+                new JapanesePartOfSpeechStopFilterFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the bogus argument.
+                fail();
+            }
+            catch (ArgumentException e)
+            {
+                bool mentionsUnknownParameters = e.Message.Contains("Unknown parameters");
+                assertTrue(mentionsUnknownParameters);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
new file mode 100644
index 0000000..141db33
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
@@ -0,0 +1,109 @@
+ï»¿using Lucene.Net.Analysis.Cjk;
+using Lucene.Net.Analysis.Core;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests for <see cref="JapaneseReadingFormFilter"/>
+    /// </summary>
+    public class TestJapaneseReadingFormFilter : BaseTokenStreamTestCase
+    {
+        // SEARCH-mode tokenizer followed by a JapaneseReadingFormFilter constructed with
+        // 'false'; the tests in this class expect katakana readings from this analyzer.
+        private Analyzer katakanaAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            var source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+            TokenStream readings = new JapaneseReadingFormFilter(source, false);
+            return new TokenStreamComponents(source, readings);
+        });
+
+        // SEARCH-mode tokenizer followed by a JapaneseReadingFormFilter constructed with
+        // 'true'; the tests in this class expect romaji readings from this analyzer.
+        private Analyzer romajiAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            var source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+            TokenStream readings = new JapaneseReadingFormFilter(source, true);
+            return new TokenStreamComponents(source, readings);
+        });
+
+
+        [Test]
+        public void TestKatakanaReadings()
+        {
+            AssertAnalyzesTo(katakanaAnalyzer, "ä»å¤ã¯ããã¼ãå
çã¨è©±ãã",
+                new String[] { "ã³ã³ã¤", "ã", "ããã¼ã", 
"ã»ã³ã»ã¤", "ã", "ããã·", "ã¿" }
+            );
+        }
+
+        [Test]
+        public void TestKatakanaReadingsHalfWidth()
+        {
+            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, 
reader) =>
+            {
+                Tokenizer tokenizer = new JapaneseTokenizer(reader, null, 
true, JapaneseTokenizerMode.SEARCH);
+                TokenStream stream = new CJKWidthFilter(tokenizer);
+                return new TokenStreamComponents(tokenizer, new 
JapaneseReadingFormFilter(stream, false));
+            });
+
+            AssertAnalyzesTo(a, "ä»å¤ã¯ï¾ï¾ï¾ï½°ï¾åçã¨è©±ãã",
+                new String[] { "ã³ã³ã¤", "ã", "ããã¼ã", 
"ã»ã³ã»ã¤", "ã", "ããã·", "ã¿" }
+            );
+        }
+
+        [Test]
+        public void TestRomajiReadings()
+        {
+            AssertAnalyzesTo(romajiAnalyzer, "ä»å¤ã¯ããã¼ãå
çã¨è©±ãã",
+                new String[] { "kon'ya", "ha", "robato", "sensei", "to", 
"hanashi", "ta" }
+            );
+        }
+
+        [Test]
+        public void TestRomajiReadingsHalfWidth()
+        {
+            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, 
reader) =>
+            {
+                Tokenizer tokenizer = new JapaneseTokenizer(reader, null, 
true, JapaneseTokenizerMode.SEARCH);
+                TokenStream stream = new CJKWidthFilter(tokenizer);
+                return new TokenStreamComponents(tokenizer, new 
JapaneseReadingFormFilter(stream, true));
+            });
+
+            AssertAnalyzesTo(a, "ä»å¤ã¯ï¾ï¾ï¾ï½°ï¾åçã¨è©±ãã",
+                new String[] { "kon'ya", "ha", "robato", "sensei", "to", 
"hanashi", "ta" }
+            );
+        }
+
+        /// <summary>
+        /// Runs <c>CheckRandomData</c> against the katakana analyzer and then the
+        /// romaji analyzer, drawing from a single shared random source.
+        /// </summary>
+        [Test]
+        public void TestRandomData()
+        {
+            Random rnd = Random();
+            int iterations = 1000 * RANDOM_MULTIPLIER;
+
+            CheckRandomData(rnd, katakanaAnalyzer, iterations);
+            CheckRandomData(rnd, romajiAnalyzer, iterations);
+        }
+
+        /// <summary>
+        /// Checks that an empty input term passed through a <c>KeywordTokenizer</c> and
+        /// the reading-form filter comes out as an empty term.
+        /// </summary>
+        [Test]
+        public void TestEmptyTerm()
+        {
+            Analyzer keywordAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                var source = new KeywordTokenizer(reader);
+                TokenStream readings = new JapaneseReadingFormFilter(source);
+                return new TokenStreamComponents(source, readings);
+            });
+
+            CheckOneTerm(keywordAnalyzer, "", "");
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
new file mode 100644
index 0000000..053652b
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
@@ -0,0 +1,59 @@
+ï»¿using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for <see cref="JapaneseReadingFormFilterFactory"/>
+    /// </summary>
+    public class TestJapaneseReadingFormFilterFactory : BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestReadings()
+        {
+            JapaneseTokenizerFactory tokenizerFactory = new 
JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream tokenStream = tokenizerFactory.Create(new 
StringReader("åã»ã©ãã«ãªã³ããæ¥ã¾ããã"));
+            JapaneseReadingFormFilterFactory filterFactory = new 
JapaneseReadingFormFilterFactory(new Dictionary<String, String>());
+            AssertTokenStreamContents(filterFactory.Create(tokenStream),
+                new String[] { "ãµã", "ãã", "ãã«ãªã³", "ã«ã©", 
"ã", "ãã·", "ã¿" }
+            );
+        }
+
+        /// <summary>
+        /// Verifies that constructing the factory with an argument it does not
+        /// recognize throws an <see cref="ArgumentException"/> whose message
+        /// contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            var bogusArgs = new Dictionary<String, String>
+            {
+                { "bogusArg", "bogusValue" }
+            };
+
+            try
+            {
+                new JapaneseReadingFormFilterFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the bogus argument.
+                fail();
+            }
+            catch (ArgumentException e)
+            {
+                bool mentionsUnknownParameters = e.Message.Contains("Unknown parameters");
+                assertTrue(mentionsUnknownParameters);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
new file mode 100644
index 0000000..0a1f819
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
@@ -0,0 +1,846 @@
+ï»¿using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class TestJapaneseTokenizer : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Loads the <c>userdict.txt</c> resource associated with
+        /// <see cref="TestJapaneseTokenizer"/>, decodes it as UTF-8 and builds a
+        /// <see cref="UserDictionary"/> from it.
+        /// </summary>
+        /// <returns>The user dictionary built from the resource.</returns>
+        /// <exception cref="Exception">
+        /// The resource could not be found, or an <see cref="IOException"/> occurred
+        /// while reading it (the original exception is kept as the inner exception).
+        /// </exception>
+        public static UserDictionary ReadDict()
+        {
+            Stream @is = typeof(TestJapaneseTokenizer).getResourceAsStream("userdict.txt");
+            if (@is == null)
+            {
+                throw new Exception("Cannot find userdict.txt in test classpath!");
+            }
+            try
+            {
+                // The outer using guarantees the stream is released even if creating the
+                // reader fails; the inner one releases the reader itself.
+                using (@is)
+                using (TextReader reader = new StreamReader(@is, Encoding.UTF8))
+                {
+                    return new UserDictionary(reader);
+                }
+            }
+            catch (IOException ioe)
+            {
+                throw new Exception(ioe.ToString(), ioe);
+            }
+        }
+
+        // SEARCH-mode tokenizer backed by the user dictionary from ReadDict(); the third
+        // constructor argument is false here (presumably "discard punctuation" - confirm
+        // against JapaneseTokenizer).
+        private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            var source = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
+            return new TokenStreamComponents(source, source);
+        });
+
+
+        // Same configuration as 'analyzer' but with the tokenizer in NORMAL mode.
+        private Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            var source = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL);
+            return new TokenStreamComponents(source, source);
+        });
+
+        // SEARCH-mode tokenizer with the third constructor argument set to true
+        // (presumably "discard punctuation", as the field name suggests - confirm
+        // against JapaneseTokenizer).
+        private Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            var source = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.SEARCH);
+            return new TokenStreamComponents(source, source);
+        });
+
+
+        // Same configuration as 'analyzerNoPunct' but with the tokenizer in EXTENDED mode.
+        private Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            var source = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.EXTENDED);
+            return new TokenStreamComponents(source, source);
+        });
+
+
+        [Test]
+        public void TestNormalMode()
+        {
+            AssertAnalyzesTo(analyzerNormal,
+                             "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢",
+                             new String[] { 
"ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢" });
+        }
+
+        [Test]
+        public void TestDecomposition1()
+        {
+            AssertAnalyzesTo(analyzerNoPunct, 
"æ¬æ¥ã¯ãè²§å°å±¤ã®å¥³æ§ãåä¾ã«å»çä¿è·ãæä¾ããããã«åµè¨ãããå¶åº¦ã§ããã"
 +
+                                 "ã¢ã¡ãªã«ä½æå¾è
å»çæ´å©å¶åº¦ããä»æ¥ã§ã¯ããã®äºç®ã®ç´ï¼åã®ï¼ãèäººã«è²»ããã¦ããã",
+             new String[] { "æ¬æ¥", "ã¯",  "è²§å°", "å±¤", "ã®", 
"å¥³æ§", "ã", "åä¾", "ã«", "å»ç", "ä¿è·", "ã",
+                    "æä¾", "ãã", "ãã", "ã«", "åµè¨", "ã", 
"ã", "ã", "å¶åº¦", "ã§", "ãã",  "ã¢ã¡ãªã«",
+                    "ä½", "æå¾", "è", "å»ç", "æ´å©", "å¶åº¦", 
"ã",  "ä»æ¥", "ã§", "ã¯",  "ãã®",
+                    "äºç®", "ã®", "ç´", "ï¼", "åã®", "ï¼", "ã", 
"èäºº", "ã«", "è²»ãã", "ã¦", "ãã" },
+             new int[] { 0, 2, 4, 6, 7,  8, 10, 11, 13, 14, 16, 18, 19, 21, 
23, 25, 26, 28, 29, 30,
+                 31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 
60,
+                 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 },
+             new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 
25, 26, 28, 29, 30, 31,
+                 33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 
62,
+                 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 }
+            );
+        }
+
+        [Test]
+        public void TestDecomposition2()
+        {
+            AssertAnalyzesTo(analyzerNoPunct, "éº»è¬ã®å¯å£²ã¯æ 
¹ãããçµ¶ãããªããã°ãªããªã",
+              new String[] { "éº»è¬", "ã®", "å¯å£²", "ã¯", "æ ¹ããã", 
"çµ¶ãã", "ãªãã", "ã°", "ãªã", "ãªã" },
+              new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
+              new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
+            );
+        }
+
+        [Test]
+        public void TestDecomposition3()
+        {
+            AssertAnalyzesTo(analyzerNoPunct, 
"éå¥³ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã",
+              new String[] { "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", 
"ãããã³ã¹" },
+              new int[] { 0, 2, 3, 5, 10 },
+              new int[] { 2, 3, 5, 9, 15 }
+            );
+        }
+
+        [Test]
+        public void TestDecomposition4()
+        {
+            AssertAnalyzesTo(analyzer, "ããã¯æ¬ã§ã¯ãªã",
+              new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" },
+              new int[] { 0, 2, 3, 4, 5, 6 },
+              new int[] { 2, 3, 4, 5, 6, 8 }
+            );
+        }
+
+        /* Note this is really a stupid test just to see if things aren't horribly slow.
+         * ideally the test would actually fail instead of hanging...
+         */
+        [Test]
+        public void TestDecomposition5()
+        {
+            TokenStream ts = analyzer.GetTokenStream("bogus", 
"ãããããããããããããããããããããããããããããããããããããããã");
+            try
+            {
+                ts.Reset();
+                while (ts.IncrementToken())
+                {
+
+                }
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(ts);
+            }
+        }
+
+        /*
+          // NOTE: intentionally fails!  Just trying to debug this
+          // one input...
+        public void testDecomposition6() throws Exception {
+          assertAnalyzesTo(analyzer, "å¥è¯åç«¯ç§å¦æè¡å¤§å¦é¢å¤§å¦",
+            new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" },
+            new int[] { 0, 2, 3, 4, 5, 6 },
+            new int[] { 2, 3, 4, 5, 6, 8 }
+                           );
+        }
+        */
+
+        /** Tests that sentence offset is incorporated into the resulting 
offsets */
+        [Test]
+        public void TestTwoSentences()
+        {
+            /*
+            //TokenStream ts = a.tokenStream("foo", 
"å¦¹ã®å²åã§ããä¿ºã¨å¹´åã§ãä»åé¨çã§ãã");
+            TokenStream ts = analyzer.tokenStream("foo", 
"&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;");
+            ts.reset();
+            CharTermAttribute termAtt = 
ts.addAttribute(CharTermAttribute.class);
+            while(ts.incrementToken()) {
+              System.out.println("  " + termAtt.toString());
+            }
+            System.out.println("DONE PARSE\n\n");
+            */
+
+            AssertAnalyzesTo(analyzerNoPunct, 
"éå¥³ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã 
éå¥³ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã",
+              new String[] { "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", 
"ãããã³ã¹", "éå¥³", "ç©", "å¤§å°", "ãã·ã¥ã¼", "ãããã³ã¹" 
},
+              new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
+              new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
+            );
+        }
+
+        /// <summary>
+        /// Blasts some random strings through <c>analyzer</c> and then
+        /// <c>analyzerNoPunct</c>, requesting a fresh random source for each run.
+        /// </summary>
+        [Test]
+        public void TestRandomStrings()
+        {
+            int iterations = 1000 * RANDOM_MULTIPLIER;
+            foreach (Analyzer candidate in new[] { analyzer, analyzerNoPunct })
+            {
+                CheckRandomData(Random(), candidate, iterations);
+            }
+        }
+
+        /// <summary>
+        /// Blasts some random large strings (length argument 8192) through
+        /// <c>analyzer</c> and then <c>analyzerNoPunct</c>, sharing one random source.
+        /// </summary>
+        [Test]
+        public void TestRandomHugeStrings()
+        {
+            Random rnd = Random();
+            int iterations = 100 * RANDOM_MULTIPLIER;
+            const int maxLength = 8192;
+
+            CheckRandomData(rnd, analyzer, iterations, maxLength);
+            CheckRandomData(rnd, analyzerNoPunct, iterations, maxLength);
+        }
+
+        /// <summary>
+        /// Runs random large strings through a SEARCH-mode tokenizer whose output is
+        /// wrapped in a <c>MockGraphTokenFilter</c>, so graph tokens are randomly
+        /// injected after the <c>JapaneseTokenizer</c>.
+        /// </summary>
+        [Test]
+        public void TestRandomHugeStringsMockGraphAfter()
+        {
+            Random rnd = Random();
+
+            Analyzer graphAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                var source = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
+                // The filter gets its own Random() per component creation, as in the original.
+                TokenStream withGraphTokens = new MockGraphTokenFilter(Random(), source);
+                return new TokenStreamComponents(source, withGraphTokens);
+            });
+
+            CheckRandomData(rnd, graphAnalyzer, 100 * RANDOM_MULTIPLIER, 8192);
+        }
+
+        /// <summary>
+        /// Tokenizes 100 random Unicode strings (length argument 10000) with
+        /// <c>analyzer</c>, consuming each stream to the end; the test passes as long
+        /// as no exception escapes.
+        /// </summary>
+        [Test]
+        public void TestLargeDocReliability()
+        {
+            for (int round = 0; round < 100; round++)
+            {
+                String text = TestUtil.RandomUnicodeString(Random(), 10000);
+                TokenStream stream = analyzer.GetTokenStream("foo", text);
+                try
+                {
+                    stream.Reset();
+                    // Drain the stream; the tokens themselves are not inspected.
+                    bool more = stream.IncrementToken();
+                    while (more)
+                    {
+                        more = stream.IncrementToken();
+                    }
+                    stream.End();
+                }
+                finally
+                {
+                    IOUtils.DisposeWhileHandlingException(stream);
+                }
+            }
+        }
+
+        /** simple test for supplementary characters */
+        [Test]
+        public void TestSurrogates()
+        {
+            AssertAnalyzesTo(analyzer, "ð©¬è±éä¹æ¯ç",
+              new String[] { "ð©¬", "è±", "é", "ä¹", "æ¯", "ç" });
+        }
+
+        /// <summary>
+        /// Random test ensuring we never split supplementary characters: every term
+        /// produced from random Unicode input must be a valid UTF-16 string.
+        /// </summary>
+        [Test]
+        public void TestSurrogates2()
+        {
+            int rounds = AtLeast(10000);
+            for (int round = 0; round < rounds; round++)
+            {
+                if (VERBOSE)
+                {
+                    Console.WriteLine("\nTEST: iter=" + round);
+                }
+
+                String text = TestUtil.RandomUnicodeString(Random(), 100);
+                TokenStream stream = analyzer.GetTokenStream("foo", text);
+                try
+                {
+                    ICharTermAttribute term = stream.AddAttribute<ICharTermAttribute>();
+                    stream.Reset();
+                    while (stream.IncrementToken())
+                    {
+                        bool wellFormed = UnicodeUtil.ValidUTF16String(term);
+                        assertTrue(wellFormed);
+                    }
+                    stream.End();
+                }
+                finally
+                {
+                    IOUtils.DisposeWhileHandlingException(stream);
+                }
+            }
+        }
+
+        [Test]
+        public void TestOnlyPunctuation()
+        {
+            TokenStream ts = analyzerNoPunct.GetTokenStream("foo", 
"ãããã");
+            try
+            {
+                ts.Reset();
+                assertFalse(ts.IncrementToken());
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(ts);
+            }
+        }
+
+        /// <summary>
+        /// Feeds input consisting only of ASCII periods to
+        /// <c>extendedModeAnalyzerNoPunct</c> and expects the stream to produce no tokens.
+        /// </summary>
+        [Test]
+        public void TestOnlyPunctuationExtended()
+        {
+            TokenStream stream = extendedModeAnalyzerNoPunct.GetTokenStream("foo", "......");
+            try
+            {
+                stream.Reset();
+                bool producedToken = stream.IncrementToken();
+                assertFalse(producedToken);
+                stream.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(stream);
+            }
+        }
+
+        // note: test is kinda silly since kuromoji emits punctuation tokens.
+        // but, when/if we filter these out it will be useful.
+        [Test]
+        public void TestEnd()
+        {
+            AssertTokenStreamContents(analyzerNoPunct.GetTokenStream("foo", 
"ããã¯æ¬ã§ã¯ãªã"),
+                new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" 
},
+                new int[] { 0, 2, 3, 4, 5, 6 },
+                new int[] { 2, 3, 4, 5, 6, 8 },
+                new int?(8)
+            );
+
+            AssertTokenStreamContents(analyzerNoPunct.GetTokenStream("foo", 
"ããã¯æ¬ã§ã¯ãªã    "),
+                new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" 
},
+                new int[] { 0, 2, 3, 4, 5, 6, 8 },
+                new int[] { 2, 3, 4, 5, 6, 8, 9 },
+                new int?(12)
+            );
+        }
+
+        [Test]
+        public void TestUserDict()
+        {
+            // Not a great test because w/o userdict.txt the
+            // segmentation is the same:
+            AssertTokenStreamContents(analyzer.GetTokenStream("foo", 
"é¢è¥¿å½éç©ºæ¸¯ã«è¡ã£ã"),
+                                      new String[] { "é¢è¥¿", "å½é", 
"ç©ºæ¸¯", "ã«", "è¡ã£", "ã" },
+                                      new int[] { 0, 2, 4, 6, 7, 9 },
+                                      new int[] { 2, 4, 6, 7, 9, 10 },
+                                      new int?(10)
+            );
+        }
+
+        [Test]
+        public void TestUserDict2()
+        {
+            // Better test: w/o userdict the segmentation is different:
+            AssertTokenStreamContents(analyzer.GetTokenStream("foo", 
"æéé¾"),
+                                      new String[] { "æéé¾" },
+                                      new int[] { 0 },
+                                      new int[] { 3 },
+                                      new int?(3)
+            );
+        }
+
+        /// <summary>
+        /// Tests a user-dictionary entry that breaks into multiple tokens:
+        /// "abcd" must be segmented as "a", "b", "cd" with matching offsets.
+        /// </summary>
+        [Test]
+        public void TestUserDict3()
+        {
+            TokenStream stream = analyzer.GetTokenStream("foo", "abcd");
+
+            String[] expectedTerms = { "a", "b", "cd" };
+            int[] expectedStartOffsets = { 0, 1, 2 };
+            int[] expectedEndOffsets = { 1, 2, 4 };
+            int? expectedFinalOffset = 4;
+
+            AssertTokenStreamContents(stream, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedFinalOffset);
+        }
+
+        // HMM: fails (segments as a/b/cd/efghij)... because the
+        // two paths have exactly equal paths (1 KNOWN + 1
+        // UNKNOWN) and we don't seem to favor longer KNOWN /
+        // shorter UNKNOWN matches:
+
+        /*
+        public void testUserDict4()  {
+          // Test entry that has another entry as prefix
+          assertTokenStreamContents(analyzer.tokenStream("foo", "abcdefghij"),
+                                    new String[] { "ab", "cd", "efg", "hij"  },
+                                    new int[] { 0, 2, 4, 7 },
+                                    new int[] { 2, 4, 7, 10 },
+                                    new int?(10)
+          );
+        }
+        */
+
+        [Test]
+        public void TestSegmentation()
+        {
+            // Skip tests for Michelle Kwan -- UniDic segments Kwan as ã¯ 
ã¯ã³
+            //   String input = 
"ãã·ã§ã«ã»ã¯ã¯ã³ãåªåãã¾ãããã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã";
+            //   String[] surfaceForms = {
+            //        "ãã·ã§ã«", "ã»", "ã¯ã¯ã³", "ã", "åªå", 
"ã", "ã¾ã", "ã", "ã",
+            //        "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", 
"ã¾ã", "ã",
+            //        "ãããããã", "ã"
+            //   };
+            String input = 
"ã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã";
+            String[]
+            surfaceForms = {
+                "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", 
"ã¾ã", "ã",
+                "ãããããã", "ã"
+            };
+            AssertAnalyzesTo(analyzer,
+                             input,
+                             surfaceForms);
+        }
+
+        [Test]
+        public void TestLatticeToDot()
+        {
+            GraphvizFormatter gv2 = new 
GraphvizFormatter(ConnectionCosts.GetInstance());
+            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: 
(fieldName, reader) =>
+            {
+                JapaneseTokenizer tokenizer = new JapaneseTokenizer(reader, 
ReadDict(), false, JapaneseTokenizerMode.SEARCH)
+                {
+                    GraphvizFormatter = gv2
+                };
+                return new TokenStreamComponents(tokenizer, tokenizer);
+            });
+
+
+            String input = 
"ã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã";
+            String[] surfaceForms = {
+                "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", 
"ã¾ã", "ã",
+                "ãããããã", "ã"
+            };
+            AssertAnalyzesTo(analyzer,
+                             input,
+                             surfaceForms);
+
+
+            assertTrue(gv2.Finish().IndexOf("22.0") != -1);
+        }
+
+        private void assertReadings(String input, params String[] readings)
+        {
+            TokenStream ts = analyzer.GetTokenStream("ignored", input);
+            try
+            {
+                IReadingAttribute readingAtt = 
ts.AddAttribute<IReadingAttribute>();
+                ts.Reset();
+                foreach (String reading in readings)
+                {
+                    assertTrue(ts.IncrementToken());
+                    assertEquals(reading, readingAtt.GetReading());
+                }
+                assertFalse(ts.IncrementToken());
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(ts);
+            }
+        }
+
+        private void assertPronunciations(String input, params String[] 
pronunciations)
+        {
+            TokenStream ts = analyzer.GetTokenStream("ignored", input);
+            try
+            {
+                IReadingAttribute readingAtt = 
ts.AddAttribute<IReadingAttribute>();
+                ts.Reset();
+                foreach (String pronunciation in pronunciations)
+                {
+                    assertTrue(ts.IncrementToken());
+                    assertEquals(pronunciation, readingAtt.GetPronunciation());
+                }
+                assertFalse(ts.IncrementToken());
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(ts);
+            }
+        }
+
+        private void assertBaseForms(String input, params String[] baseForms)
+        {
+            TokenStream ts = analyzer.GetTokenStream("ignored", input);
+            try
+            {
+                IBaseFormAttribute baseFormAtt = 
ts.AddAttribute<IBaseFormAttribute>();
+                ts.Reset();
+                foreach (String baseForm in baseForms)
+                {
+                    assertTrue(ts.IncrementToken());
+                    assertEquals(baseForm, baseFormAtt.GetBaseForm());
+                }
+                assertFalse(ts.IncrementToken());
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(ts);
+            }
+        }
+
+        private void assertInflectionTypes(String input, params String[] 
inflectionTypes)
+        {
+            TokenStream ts = analyzer.GetTokenStream("ignored", input);
+            try
+            {
+                IInflectionAttribute inflectionAtt = 
ts.AddAttribute<IInflectionAttribute>();
+                ts.Reset();
+                foreach (String inflectionType in inflectionTypes)
+                {
+                    assertTrue(ts.IncrementToken());
+                    assertEquals(inflectionType, 
inflectionAtt.GetInflectionType());
+                }
+                assertFalse(ts.IncrementToken());
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(ts);
+            }
+        }
+
+        /// <summary>
+        /// Tokenizes <paramref name="input"/> through the <c>analyzer</c> field and asserts that
+        /// the inflection form of each successive token equals the matching entry of
+        /// <paramref name="inflectionForms"/>, and that no tokens remain afterwards.
+        /// </summary>
+        /// <param name="input">Text to analyze.</param>
+        /// <param name="inflectionForms">Expected inflection form per token, in order.</param>
+        private void assertInflectionForms(String input, params String[] inflectionForms)
+        {
+            TokenStream stream = analyzer.GetTokenStream("ignored", input);
+            try
+            {
+                IInflectionAttribute attribute = stream.AddAttribute<IInflectionAttribute>();
+                stream.Reset();
+                for (int i = 0; i < inflectionForms.Length; i++)
+                {
+                    assertTrue(stream.IncrementToken());
+                    assertEquals(inflectionForms[i], attribute.GetInflectionForm());
+                }
+                // The stream must be exhausted once every expected value was consumed.
+                assertFalse(stream.IncrementToken());
+                stream.End();
+            }
+            finally
+            {
+                // Always release the stream, even when an assertion above fails.
+                IOUtils.DisposeWhileHandlingException(stream);
+            }
+        }
+
+        /// <summary>
+        /// Tokenizes <paramref name="input"/> through the <c>analyzer</c> field and asserts that
+        /// the part-of-speech tag of each successive token equals the matching entry of
+        /// <paramref name="partsOfSpeech"/>, and that no tokens remain afterwards.
+        /// </summary>
+        /// <param name="input">Text to analyze.</param>
+        /// <param name="partsOfSpeech">Expected part-of-speech tag per token, in order.</param>
+        private void assertPartsOfSpeech(String input, params String[] partsOfSpeech)
+        {
+            TokenStream stream = analyzer.GetTokenStream("ignored", input);
+            try
+            {
+                IPartOfSpeechAttribute attribute = stream.AddAttribute<IPartOfSpeechAttribute>();
+                stream.Reset();
+                for (int i = 0; i < partsOfSpeech.Length; i++)
+                {
+                    assertTrue(stream.IncrementToken());
+                    assertEquals(partsOfSpeech[i], attribute.GetPartOfSpeech());
+                }
+                // The stream must be exhausted once every expected value was consumed.
+                assertFalse(stream.IncrementToken());
+                stream.End();
+            }
+            finally
+            {
+                // Always release the stream, even when an assertion above fails.
+                IOUtils.DisposeWhileHandlingException(stream);
+            }
+        }
+
+        /// <summary>
+        /// Checks the reading reported for each of the six tokens of a short sentence.
+        /// </summary>
+        [Test]
+        public void TestReadings()
+        {
+            assertReadings("寿司が食べたいです。",
+                           "スシ",
+                           "ガ",
+                           "タベ",
+                           "タイ",
+                           "デス",
+                           "。");
+        }
+
+        /// <summary>
+        /// Checks the reading reported for each of the nine tokens of a second sentence.
+        /// </summary>
+        [Test]
+        public void TestReadings2()
+        {
+            assertReadings("多くの学生が試験に落ちた。",
+                           "オオク",
+                           "ノ",
+                           "ガクセイ",
+                           "ガ",
+                           "シケン",
+                           "ニ",
+                           "オチ",
+                           "タ",
+                           "。");
+        }
+
+        /// <summary>
+        /// Checks the pronunciation reported for each of the six tokens of a short sentence.
+        /// </summary>
+        [Test]
+        public void TestPronunciations()
+        {
+            assertPronunciations("寿司が食べたいです。",
+                                 "スシ",
+                                 "ガ",
+                                 "タベ",
+                                 "タイ",
+                                 "デス",
+                                 "。");
+        }
+
+        /// <summary>
+        /// Checks pronunciations for a sentence whose first token is pronounced
+        /// differently ("オーク") from its reading ("オオク" in <c>TestReadings2</c>).
+        /// </summary>
+        [Test]
+        public void TestPronunciations2()
+        {
+            // pronunciation differs from reading here
+            assertPronunciations("多くの学生が試験に落ちた。",
+                                 "オーク",
+                                 "ノ",
+                                 "ガクセイ",
+                                 "ガ",
+                                 "シケン",
+                                 "ニ",
+                                 "オチ",
+                                 "タ",
+                                 "。");
+        }
+
+        /// <summary>
+        /// Checks base forms for a nine-token sentence: only the seventh token is expected
+        /// to report a base form ("ある"); all other tokens are expected to report null.
+        /// </summary>
+        [Test]
+        public void TestBasicForms()
+        {
+            assertBaseForms("それはまだ実験段階にあります。",
+                            null,
+                            null,
+                            null,
+                            null,
+                            null,
+                            null,
+                            "ある",
+                            null,
+                            null);
+        }
+
+        /// <summary>
+        /// Checks inflection types for a nine-token sentence: only the seventh and eighth
+        /// tokens are expected to report one; all other tokens are expected to report null.
+        /// </summary>
+        [Test]
+        public void TestInflectionTypes()
+        {
+            assertInflectionTypes("それはまだ実験段階にあります。",
+                                  null,
+                                  null,
+                                  null,
+                                  null,
+                                  null,
+                                  null,
+                                  "五段・ラ行",
+                                  "特殊・マス",
+                                  null);
+        }
+
+        /// <summary>
+        /// Checks inflection forms for a nine-token sentence: only the seventh and eighth
+        /// tokens are expected to report one; all other tokens are expected to report null.
+        /// </summary>
+        [Test]
+        public void TestInflectionForms()
+        {
+            assertInflectionForms("それはまだ実験段階にあります。",
+                                  null,
+                                  null,
+                                  null,
+                                  null,
+                                  null,
+                                  null,
+                                  "連用形",
+                                  "基本形",
+                                  null);
+        }
+
+        /// <summary>
+        /// Checks the part-of-speech tag reported for each of the nine tokens of a sentence.
+        /// </summary>
+        [Test]
+        public void TestPartOfSpeech()
+        {
+            assertPartsOfSpeech("それはまだ実験段階にあります。",
+                                "名詞-代名詞-一般",
+                                "助詞-係助詞",
+                                "副詞-助詞類接続",
+                                "名詞-サ変接続",
+                                "名詞-一般",
+                                "助詞-格助詞-一般",
+                                "動詞-自立",
+                                "助動詞",
+                                "記号-句点");
+        }
+
+        // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+        // do we have a possibility to actually lookup the first and last word from dictionary?
+
+        /// <summary>
+        /// Checks that "やぼったい" is analyzed as a single token.
+        /// </summary>
+        [Test]
+        public void TestYabottai()
+        {
+            AssertAnalyzesTo(analyzer, "やぼったい",
+                             new String[] { "やぼったい" });
+        }
+
+        /// <summary>
+        /// Checks that "突き通しゃ" is analyzed as a single token.
+        /// </summary>
+        [Test]
+        public void TestTsukitosha()
+        {
+            AssertAnalyzesTo(analyzer, "突き通しゃ",
+                             new String[] { "突き通しゃ" });
+        }
+
+        /// <summary>
+        /// Runs the Bocchan tokenization pass once.
+        /// </summary>
+        [Test]
+        public void TestBocchan()
+        {
+            const int iterations = 1;
+            doTestBocchan(iterations);
+        }
+
+        /// <summary>
+        /// Runs the Bocchan tokenization pass 100 times; ignored by default because it is slow.
+        /// </summary>
+        [Test, LongRunningTest]//@Nightly
+        [Ignore("This test takes a long time to run - do it manually")]
+        public void TestBocchanBig()
+        {
+            const int iterations = 100;
+            doTestBocchan(iterations);
+        }
+
+        /*
+        public void testWikipedia()  {
+          final FileInputStream fis = new 
FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
+          final Reader r = new BufferedReader(new InputStreamReader(fis, 
StandardCharsets.UTF_8));
+
+          final long startTimeNS = System.nanoTime();
+          boolean done = false;
+          long compoundCount = 0;
+          long nonCompoundCount = 0;
+          long netOffset = 0;
+          while (!done) {
+            final TokenStream ts = analyzer.tokenStream("ignored", r);
+            ts.reset();
+            final PositionIncrementAttribute posIncAtt = 
ts.addAttribute(PositionIncrementAttribute.class);
+            final OffsetAttribute offsetAtt = 
ts.addAttribute(OffsetAttribute.class);
+            int count = 0;
+            while (true) {
+              if (!ts.incrementToken()) {
+                done = true;
+                break;
+              }
+              count++;
+              if (posIncAtt.getPositionIncrement() == 0) {
+                compoundCount++;
+              } else {
+                nonCompoundCount++;
+                if (nonCompoundCount % 1000000 == 0) {
+                  System.out.println(String.format("%.2f msec [pos=%d, %d, 
%d]",
+                                                   
(System.nanoTime()-startTimeNS)/1000000.0,
+                                                   netOffset + 
offsetAtt.startOffset(),
+                                                   nonCompoundCount,
+                                                   compoundCount));
+                }
+              }
+              if (count == 100000000) {
+                System.out.println("  again...");
+                break;
+              }
+            }
+            ts.end();
+            netOffset += offsetAtt.endOffset();
+          }
+          System.out.println("compoundCount=" + compoundCount + " 
nonCompoundCount=" + nonCompoundCount);
+          r.close();
+        }
+        */
+
+
+        /// <summary>
+        /// Reads the first line of the "bocchan.utf-8" resource and tokenizes it
+        /// <paramref name="numIterations"/> times, first as a single string and then split
+        /// into sentences on "、" and "。". Elapsed tick counts are printed when VERBOSE is set.
+        /// No assertions are made about the produced tokens.
+        /// </summary>
+        /// <param name="numIterations">How many times each tokenization pass is repeated.</param>
+        private void doTestBocchan(int numIterations)
+        {
+            String line;
+            // Dispose the reader even if reading the line fails.
+            using (TextReader reader = new StreamReader(
+                this.GetType().getResourceAsStream("bocchan.utf-8"), Encoding.UTF8))
+            {
+                line = reader.ReadLine();
+            }
+
+            if (VERBOSE)
+            {
+                Console.WriteLine("Test for Bocchan without pre-splitting sentences");
+            }
+
+            /*
+            if (numIterations > 1) {
+              // warmup
+              for (int i = 0; i < numIterations; i++) {
+                final TokenStream ts = analyzer.tokenStream("ignored", line);
+                ts.reset();
+                while(ts.incrementToken());
+              }
+            }
+            */
+
+            long totalStart = Environment.TickCount;
+            for (int i = 0; i < numIterations; i++)
+            {
+                TokenStream ts = analyzer.GetTokenStream("ignored", line);
+                try
+                {
+                    ts.Reset();
+                    while (ts.IncrementToken())
+                    {
+                        // Drain the stream; only the elapsed time matters here.
+                    }
+                    ts.End();
+                }
+                finally
+                {
+                    IOUtils.DisposeWhileHandlingException(ts);
+                }
+            }
+            String[] sentences = Regex.Split(line, "、|。");
+            if (VERBOSE)
+            {
+                Console.WriteLine("Total time : " + (Environment.TickCount - totalStart));
+                Console.WriteLine("Test for Bocchan with pre-splitting sentences (" + sentences.Length + " sentences)");
+            }
+            totalStart = Environment.TickCount;
+            for (int i = 0; i < numIterations; i++)
+            {
+                foreach (String sentence in sentences)
+                {
+                    TokenStream ts = analyzer.GetTokenStream("ignored", sentence);
+                    try
+                    {
+                        ts.Reset();
+                        while (ts.IncrementToken())
+                        {
+                            // Drain the stream; only the elapsed time matters here.
+                        }
+                        ts.End();
+                    }
+                    finally
+                    {
+                        IOUtils.DisposeWhileHandlingException(ts);
+                    }
+                }
+            }
+            if (VERBOSE)
+            {
+                Console.WriteLine("Total time : " + (Environment.TickCount - totalStart));
+            }
+        }
+
+        /// <summary>
+        /// Checks, using <c>analyzerNoPunct</c>, that "羽田。空港" yields the two tokens
+        /// "羽田" and "空港", each with position increment 1.
+        /// </summary>
+        [Test]
+        public void TestWithPunctuation()
+        {
+            AssertAnalyzesTo(analyzerNoPunct, "羽田。空港",
+                             new String[] { "羽田", "空港" },
+                             new int[] { 1, 1 });
+        }
+
+        /// <summary>
+        /// Checks, using <c>analyzerNoPunct</c>, the tokens and positions produced for a
+        /// mixed Latin/Greek/Japanese input containing the symbol "϶", which is absent from
+        /// the expected tokens.
+        /// </summary>
+        [Test]
+        public void TestCompoundOverPunctuation()
+        {
+            AssertAnalyzesToPositions(analyzerNoPunct, "dεε϶ϢϏΎϷΞͺ羽田",
+                                      new String[] { "d", "ε", "ε", "ϢϏΎϷΞͺ", "羽田" },
+                                      new int[] { 1, 1, 1, 1, 1 },
+                                      new int[] { 1, 1, 1, 1, 1 });
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs
new file mode 100644
index 0000000..91fbf16
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs
@@ -0,0 +1,134 @@
+ï»¿using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Simple tests for <see cref="JapaneseTokenizerFactory"/>
+    /// </summary>
+    public class TestJapaneseTokenizerFactory : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Tokenizes a short sentence with a factory built from no arguments and checks
+        /// the token texts together with their start and end offsets.
+        /// </summary>
+        [Test]
+        public void TestSimple()
+        {
+            JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            factory.Inform(new StringMockResourceLoader(""));
+            TokenStream ts = factory.Create(new StringReader("これは本ではない"));
+            AssertTokenStreamContents(ts,
+                new String[] { "これ", "は", "本", "で", "は", "ない" },
+                new int[] { 0, 2, 3, 4, 5, 6 },
+                new int[] { 2, 3, 4, 5, 6, 8 }
+            );
+        }
+
+        /// <summary>
+        /// Test that search mode is enabled and working by default
+        /// </summary>
+        [Test]
+        public void TestDefaults()
+        {
+            JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            factory.Inform(new StringMockResourceLoader(""));
+            TokenStream ts = factory.Create(new StringReader("シニアソフトウェアエンジニア"));
+            AssertTokenStreamContents(ts,
+                new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
+            );
+        }
+
+        /// <summary>
+        /// Test mode parameter: specifying normal mode
+        /// </summary>
+        [Test]
+        public void TestMode()
+        {
+            IDictionary<String, String> args = new Dictionary<String, String>();
+            args.Put("mode", "normal");
+            JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
+            factory.Inform(new StringMockResourceLoader(""));
+            TokenStream ts = factory.Create(new StringReader("シニアソフトウェアエンジニア"));
+            AssertTokenStreamContents(ts,
+                new String[] { "シニアソフトウェアエンジニア" }
+            );
+        }
+
+        /// <summary>
+        /// Test user dictionary
+        /// </summary>
+        [Test]
+        public void TestUserDict()
+        {
+            // The dictionary text is served by the mock resource loader passed to Inform below.
+            String userDict =
+                "# Custom segmentation for long entries\n" +
+                "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n" +
+                "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" +
+                "# Custom reading for sumo wrestler\n" +
+                "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
+            IDictionary<String, String> args = new Dictionary<String, String>();
+            args.Put("userDictionary", "userdict.txt");
+            JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
+            factory.Inform(new StringMockResourceLoader(userDict));
+            TokenStream ts = factory.Create(new StringReader("関西国際空港に行った"));
+            AssertTokenStreamContents(ts,
+                new String[] { "関西", "国際", "空港", "に", "行っ", "た" }
+            );
+        }
+
+        /// <summary>
+        /// Test preserving punctuation
+        /// </summary>
+        [Test]
+        public void TestPreservePunctuation()
+        {
+            IDictionary<String, String> args = new Dictionary<String, String>();
+            args.Put("discardPunctuation", "false");
+            JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
+            factory.Inform(new StringMockResourceLoader(""));
+            TokenStream ts = factory.Create(
+                new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています！お寿司が食べたいな。。。")
+            );
+            AssertTokenStreamContents(ts,
+                new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
+                    "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
+                    "楽しみ", "に", "し", "て", "い", "ます", "！",
+                    "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。" }
+            );
+        }
+
+        /// <summary>
+        /// Test that bogus arguments result in exception
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            try
+            {
+                new JapaneseTokenizerFactory(new Dictionary<String, String>() {
+                    { "bogusArg", "bogusValue" }
+                });
+                // Reaching this line means the constructor accepted the unknown key.
+                fail();
+            }
+            catch (ArgumentException expected)
+            {
+                assertTrue(expected.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs
new file mode 100644
index 0000000..bb9fdae
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs
@@ -0,0 +1,92 @@
+ï»¿using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests <see cref="JapaneseTokenizer"/> segmentation in search mode against the cases
+    /// listed in the "search-segmentation-tests.txt" resource.
+    /// </summary>
+    public class TestSearchMode : BaseTokenStreamTestCase
+    {
+        private static readonly String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
+        private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+            return new TokenStreamComponents(tokenizer, tokenizer);
+        });
+
+
+        /// <summary>
+        /// Test search mode segmentation. Each non-empty line of the resource (after
+        /// stripping "#" comments) holds the source text, a tab, and the whitespace-separated
+        /// expected tokens. A token ending in "/0" is expected with position increment 0;
+        /// every other token with position increment 1.
+        /// </summary>
+        /// <exception cref="FileNotFoundException">The segmentation resource cannot be found.</exception>
+        [Test]
+        public void TestSearchSegmentation()
+        {
+            Stream @is = typeof(TestSearchMode).getResourceAsStream(SEGMENTATION_FILENAME);
+            if (@is == null)
+            {
+                throw new FileNotFoundException("Cannot find " + SEGMENTATION_FILENAME + " in test classpath");
+            }
+            // Dispose both the stream and the reader wrapping it, even when an assertion fails.
+            using (@is)
+            using (TextReader reader = new StreamReader(@is, Encoding.UTF8))
+            {
+                String line = null;
+                int lineNumber = 0;
+
+                while ((line = reader.ReadLine()) != null)
+                {
+                    lineNumber++;
+                    // Remove comments
+                    line = Regex.Replace(line, "#.*$", "");
+                    // Skip empty lines or comment lines
+                    if (line.Trim() == string.Empty)
+                    {
+                        continue;
+                    }
+                    if (VERBOSE)
+                    {
+                        Console.WriteLine("Line no. " + lineNumber + ": " + line);
+                    }
+                    // Split on the first tab only: source text, then the expected tokens.
+                    String[] fields = line.Split(new char[] { '\t' }, 2);
+                    String sourceText = fields[0];
+                    String[] expectedTokens = Regex.Split(fields[1], "\\s+");
+                    int[] expectedPosIncrs = new int[expectedTokens.Length];
+                    for (int tokIDX = 0; tokIDX < expectedTokens.Length; tokIDX++)
+                    {
+                        if (expectedTokens[tokIDX].EndsWith("/0", StringComparison.Ordinal))
+                        {
+                            // Strip the marker; the increment stays at the array default of 0.
+                            expectedTokens[tokIDX] = expectedTokens[tokIDX].Replace("/0", "");
+                        }
+                        else
+                        {
+                            expectedPosIncrs[tokIDX] = 1;
+                        }
+                    }
+                    AssertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
+                }
+            }
+        }
+    }
+}

[04/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

Reply via email to