http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
new file mode 100644
index 0000000..609803f
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
@@ -0,0 +1,84 @@
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Util;
using NUnit.Framework;
using System;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests for <see cref="JapaneseBaseFormFilter"/>.
    /// </summary>
    public class TestJapaneseBaseFormFilter : BaseTokenStreamTestCase
    {
        // Shared analyzer: JapaneseTokenizer (default mode) followed by JapaneseBaseFormFilter.
        private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
            return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
        });

        /// <summary>
        /// The inflected token あり is expected to come out as ある.
        /// </summary>
        [Test]
        public void TestBasics()
        {
            AssertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
                new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" }
            );
        }

        /// <summary>
        /// With あり placed in the <see cref="SetKeywordMarkerFilter"/> exclusion set,
        /// the token is expected to pass through unchanged.
        /// </summary>
        [Test]
        public void TestKeyword()
        {
            CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("あり"), false);
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
                TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
                return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
            });

            AssertAnalyzesTo(a, "それはまだ実験段階にあります",
                new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
            );
        }

        /// <summary>
        /// Non-Japanese input is expected to be emitted as-is.
        /// </summary>
        [Test]
        public void TestEnglish()
        {
            AssertAnalyzesTo(analyzer, "this atest",
                new String[] { "this", "atest" });
        }

        /// <summary>
        /// Runs random input through the shared analyzer.
        /// </summary>
        [Test]
        public void TestRandomStrings()
        {
            CheckRandomData(Random(), analyzer, AtLeast(1000));
        }

        /// <summary>
        /// An empty term produced by a <see cref="KeywordTokenizer"/> must stay empty.
        /// </summary>
        [Test]
        public void TestEmptyTerm()
        {
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
            });

            CheckOneTerm(a, "", "");
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
new file mode 100644
index 0000000..61a8b2e
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
@@ -0,0 +1,60 @@
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Simple tests for <see cref="JapaneseBaseFormFilterFactory"/>
    /// </summary>
    public class TestJapaneseBaseFormFilterFactory : BaseTokenStreamTestCase
    {
        /// <summary>
        /// Builds tokenizer and filter through their factories (no arguments) and
        /// checks the resulting token stream.
        /// </summary>
        [Test]
        public void TestBasics()
        {
            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));
            TokenStream ts = tokenizerFactory.Create(new StringReader("それはまだ実験段階にあります"));
            JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(new Dictionary<String, String>());
            ts = factory.Create(ts);
            AssertTokenStreamContents(ts,
                new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" }
            );
        }

        /// <summary>
        /// Test that bogus arguments result in exception
        /// </summary>
        [Test]
        public void TestBogusArguments()
        {
            try
            {
                new JapaneseBaseFormFilterFactory(new Dictionary<String, String>() {
                    { "bogusArg", "bogusValue" }
                });
                fail();
            }
            catch (ArgumentException expected)
            {
                assertTrue(expected.Message.Contains("Unknown parameters"));
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
new file mode 100644
index 0000000..9db0903
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
@@ -0,0 +1,241 @@
using NUnit.Framework;
using System;
using System.IO;
using System.Text;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests for <see cref="JapaneseIterationMarkCharFilter"/>.
    /// </summary>
    public class TestJapaneseIterationMarkCharFilter : BaseTokenStreamTestCase
    {
        // Keyword-mode MockTokenizer: the whole (char-filtered) input becomes a single token.
        private readonly Analyzer keywordAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
            return new TokenStreamComponents(tokenizer, tokenizer);
        },
        initReader: (fieldName, reader) =>
        {
            return new JapaneseIterationMarkCharFilter(reader);
        });


        // JapaneseTokenizer in SEARCH mode reading from the same char filter.
        private readonly Analyzer japaneseAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new JapaneseTokenizer(reader, null, false, JapaneseTokenizerMode.SEARCH);
            return new TokenStreamComponents(tokenizer, tokenizer);
        },
        initReader: (fieldName, reader) =>
        {
            return new JapaneseIterationMarkCharFilter(reader);
        });

        [Test]
        public void TestKanji()
        {
            // Test single repetition
            AssertAnalyzesTo(keywordAnalyzer, "時々", new String[] { "時時" });
            AssertAnalyzesTo(japaneseAnalyzer, "時々", new String[] { "時時" });

            // Test multiple repetitions
            AssertAnalyzesTo(keywordAnalyzer, "馬鹿々々しい", new String[] { "馬鹿馬鹿しい" });
            AssertAnalyzesTo(japaneseAnalyzer, "馬鹿々々しい", new String[] { "馬鹿馬鹿しい" });
        }

        [Test]
        public void TestKatakana()
        {
            // Test single repetition
            AssertAnalyzesTo(keywordAnalyzer, "ミスヾ", new String[] { "ミスズ" });
            AssertAnalyzesTo(japaneseAnalyzer, "ミスヾ", new String[] { "ミ", "スズ" }); // Side effect
        }

        [Test]
        public void testHiragana()
        {
            // Test single unvoiced iteration
            AssertAnalyzesTo(keywordAnalyzer, "おゝの", new String[] { "おおの" });
            AssertAnalyzesTo(japaneseAnalyzer, "おゝの", new String[] { "お", "おの" }); // Side effect

            // Test single voiced iteration
            AssertAnalyzesTo(keywordAnalyzer, "みすゞ", new String[] { "みすず" });
            AssertAnalyzesTo(japaneseAnalyzer, "みすゞ", new String[] { "みすず" });

            // Test single voiced iteration
            AssertAnalyzesTo(keywordAnalyzer, "じゞ", new String[] { "じじ" });
            AssertAnalyzesTo(japaneseAnalyzer, "じゞ", new String[] { "じじ" });

            // Test single unvoiced iteration with voiced iteration
            AssertAnalyzesTo(keywordAnalyzer, "じゝ", new String[] { "じし" });
            AssertAnalyzesTo(japaneseAnalyzer, "じゝ", new String[] { "じし" });

            // Test multiple repetitions with voiced iteration
            AssertAnalyzesTo(keywordAnalyzer, "ところゞゝゝ", new String[] { "ところどころ" });
            AssertAnalyzesTo(japaneseAnalyzer, "ところゞゝゝ", new String[] { "ところどころ" });
        }

        [Test]
        public void TestMalformed()
        {
            // We can't iterate c here, so emit as it is
            AssertAnalyzesTo(keywordAnalyzer, "abcところゝゝゝゝ", new String[] { "abcところcところ" });

            // We can't iterate c (with dakuten change) here, so emit it as-is
            AssertAnalyzesTo(keywordAnalyzer, "abcところゞゝゝゝ", new String[] { "abcところcところ" });

            // We can't iterate before beginning of stream, so emit characters as-is
            AssertAnalyzesTo(keywordAnalyzer, "ところゞゝゝゞゝゞ", new String[] { "ところどころゞゝゞ" });

            // We can't iterate an iteration mark only, so emit as-is
            AssertAnalyzesTo(keywordAnalyzer, "々", new String[] { "々" });
            AssertAnalyzesTo(keywordAnalyzer, "ゞ", new String[] { "ゞ" });
            AssertAnalyzesTo(keywordAnalyzer, "ゞゝ", new String[] { "ゞゝ" });

            // We can't iterate a full stop punctuation mark (because we use it as a flush marker)
            AssertAnalyzesTo(keywordAnalyzer, "。ゝ", new String[] { "。ゝ" });
            AssertAnalyzesTo(keywordAnalyzer, "。。ゝゝ", new String[] { "。。ゝゝ" });

            // We can iterate other punctuation marks
            AssertAnalyzesTo(keywordAnalyzer, "？ゝ", new String[] { "？？" });

            // We can not get a dakuten variant of ぽ -- this is also a corner case test for inside()
            AssertAnalyzesTo(keywordAnalyzer, "ねやぽゞつむぴ", new String[] { "ねやぽぽつむぴ" });
            AssertAnalyzesTo(keywordAnalyzer, "ねやぽゝつむぴ", new String[] { "ねやぽぽつむぴ" });
        }

        [Test]
        public void TestEmpty()
        {
            // Empty input stays empty
            AssertAnalyzesTo(keywordAnalyzer, "", new String[0]);
            AssertAnalyzesTo(japaneseAnalyzer, "", new String[0]);
        }

        [Test]
        public void TestFullStop()
        {
            // Test full stops
            AssertAnalyzesTo(keywordAnalyzer, "。", new String[] { "。" });
            AssertAnalyzesTo(keywordAnalyzer, "。。", new String[] { "。。" });
            AssertAnalyzesTo(keywordAnalyzer, "。。。", new String[] { "。。。" });
        }

        [Test]
        public void TestKanjiOnly()
        {
            // Test kanji only repetition marks
            CharFilter filter = new JapaneseIterationMarkCharFilter(
                new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
                true, // kanji
                false // no kana
            );
            assertCharFilterEquals(filter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
        }

        [Test]
        public void TestKanaOnly()
        {
            // Test kana only repetition marks
            CharFilter filter = new JapaneseIterationMarkCharFilter(
                new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
                false, // no kanji
                true   // kana
            );
            assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
        }

        [Test]
        public void TestNone()
        {
            // Test no repetition marks
            CharFilter filter = new JapaneseIterationMarkCharFilter(
                new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
                false, // no kanji
                false  // no kana
            );
            assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
        }

        [Test]
        public void TestCombinations()
        {
            AssertAnalyzesTo(keywordAnalyzer, "時々、おゝのさんと一緒にお寿司を食べに行きます。",
                new String[] { "時時、おおのさんと一緒にお寿司を食べに行きます。" }
            );
        }

        [Test]
        public void TestHiraganaCoverage()
        {
            // Test all hiragana iteration variants
            String source = "かゝがゝきゝぎゝくゝぐゝけゝげゝこゝごゝさゝざゝしゝじゝすゝずゝせゝぜゝそゝぞゝたゝだゝちゝぢゝつゝづゝてゝでゝとゝどゝはゝばゝひゝびゝふゝぶゝへゝべゝほゝぼゝ";
            String target = "かかがかききぎきくくぐくけけげけここごこささざさししじしすすずすせせぜせそそぞそたただたちちぢちつつづつててでてととどとははばはひひびひふふぶふへへべへほほぼほ";
            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });

            // Test all hiragana iteration variants with dakuten
            source = "かゞがゞきゞぎゞくゞぐゞけゞげゞこゞごゞさゞざゞしゞじゞすゞずゞせゞぜゞそゞぞゞたゞだゞちゞぢゞつゞづゞてゞでゞとゞどゞはゞばゞひゞびゞふゞぶゞへゞべゞほゞぼゞ";
            target = "かがががきぎぎぎくぐぐぐけげげげこごごごさざざざしじじじすずずずせぜぜぜそぞぞぞただだだちぢぢぢつづづづてでででとどどどはばばばひびびびふぶぶぶへべべべほぼぼぼ";
            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
        }

        [Test]
        public void TestKatakanaCoverage()
        {
            // Test all katakana iteration variants
            String source = "カヽガヽキヽギヽクヽグヽケヽゲヽコヽゴヽサヽザヽシヽジヽスヽズヽセヽゼヽソヽゾヽタヽダヽチヽヂヽツヽヅヽテヽデヽトヽドヽハヽバヽヒヽビヽフヽブヽヘヽベヽホヽボヽ";
            String target = "カカガカキキギキククグクケケゲケココゴコササザサシシジシススズスセセゼセソソゾソタタダタチチヂチツツヅツテテデテトトドトハハバハヒヒビヒフフブフヘヘベヘホホボホ";
            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });

            // Test all katakana iteration variants with dakuten
            source = "カヾガヾキヾギヾクヾグヾケヾゲヾコヾゴヾサヾザヾシヾジヾスヾズヾセヾゼヾソヾゾヾタヾダヾチヾヂヾツヾヅヾテヾデヾトヾドヾハヾバヾヒヾビヾフヾブヾヘヾベヾホヾボヾ";
            target = "カガガガキギギギクグググケゲゲゲコゴゴゴサザザザシジジジスズズズセゼゼゼソゾゾゾタダダダチヂヂヂツヅヅヅテデデデトドドドハバババヒビビビフブブブヘベベベホボボボ";
            AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
        }

        [Test]
        public void TestRandomStrings()
        {
            // Blast some random strings through
            CheckRandomData(Random(), keywordAnalyzer, 1000 * RANDOM_MULTIPLIER);
        }

        [Test]
        public void TestRandomHugeStrings()
        {
            // Blast some random strings through
            CheckRandomData(Random(), keywordAnalyzer, 100 * RANDOM_MULTIPLIER, 8192);
        }

        /// <summary>
        /// Drains <paramref name="filter"/> and asserts that the text it yields
        /// equals <paramref name="expected"/>.
        /// </summary>
        private void assertCharFilterEquals(CharFilter filter, String expected)
        {
            String actual = readFully(filter);
            assertEquals(expected, actual);
        }

        /// <summary>
        /// Reads <paramref name="stream"/> one character at a time until
        /// <see cref="TextReader.Read()"/> returns -1 and returns everything read.
        /// </summary>
        private String readFully(TextReader stream)
        {
            StringBuilder buffer = new StringBuilder();
            int ch;
            while ((ch = stream.Read()) != -1)
            {
                buffer.Append((char)ch);
            }
            return buffer.ToString();
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
new file mode 100644
index 0000000..88f71a9
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
@@ -0,0 +1,108 @@
using Lucene.Net.Support;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
*/

    /// <summary>
    /// Simple tests for <see cref="JapaneseIterationMarkCharFilterFactory"/>
    /// </summary>
    public class TestJapaneseIterationMarkCharFilterFactory : BaseTokenStreamTestCase
    {
        /// <summary>
        /// Default factory arguments, with the whole filtered text read back as one keyword token.
        /// </summary>
        [Test]
        public void TestIterationMarksWithKeywordTokenizer()
        {
            String text = "時々馬鹿々々しいところゞゝゝミスヾ";
            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>());
            TextReader filter = filterFactory.Create(new StringReader(text));
            TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
            AssertTokenStreamContents(tokenStream, new String[] { "時時馬鹿馬鹿しいところどころミスズ" });
        }

        /// <summary>
        /// Default factory arguments, tokenized by a factory-built Japanese tokenizer.
        /// </summary>
        [Test]
        public void TestIterationMarksWithJapaneseTokenizer()
        {
            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>());
            TextReader filter = filterFactory.Create(
                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
            );
            TokenStream tokenStream = tokenizerFactory.Create(filter);
            AssertTokenStreamContents(tokenStream, new String[] { "時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ" });
        }

        /// <summary>
        /// normalizeKanji=true, normalizeKana=false: kana iteration marks are expected to stay in the output.
        /// </summary>
        [Test]
        public void TestKanjiOnlyIterationMarksWithJapaneseTokenizer()
        {
            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            IDictionary<String, String> filterArgs = new Dictionary<String, String>();
            filterArgs.Put("normalizeKanji", "true");
            filterArgs.Put("normalizeKana", "false");
            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);

            TextReader filter = filterFactory.Create(
                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
            );
            TokenStream tokenStream = tokenizerFactory.Create(filter);
            AssertTokenStreamContents(tokenStream, new String[] { "時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ" });
        }

        /// <summary>
        /// normalizeKanji=false, normalizeKana=true: kanji iteration marks are expected to stay in the output.
        /// </summary>
        [Test]
        public void TestKanaOnlyIterationMarksWithJapaneseTokenizer()
        {
            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            IDictionary<String, String> filterArgs = new Dictionary<String, String>();
            filterArgs.Put("normalizeKanji", "false");
            filterArgs.Put("normalizeKana", "true");
            JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);

            TextReader filter = filterFactory.Create(
                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
            );
            TokenStream tokenStream = tokenizerFactory.Create(filter);
            AssertTokenStreamContents(tokenStream, new String[] { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" });
        }

        /// <summary>
        /// Test that bogus arguments result in exception
        /// </summary>
        [Test]
        public void TestBogusArguments()
        {
            try
            {
                new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>() {
                    { "bogusArg", "bogusValue" }
                });
                fail();
            }
            catch (ArgumentException expected)
            {
                assertTrue(expected.Message.Contains("Unknown parameters"));
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
new file mode 100644
index 0000000..cbbc95b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
@@ -0,0 +1,100 @@
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Util;
using NUnit.Framework;
using System;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests for <see cref="JapaneseKatakanaStemFilter"/>
    /// </summary>
    public class TestJapaneseKatakanaStemFilter : BaseTokenStreamTestCase
    {
        private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            // Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
            Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(source));
        });

        /// <summary>
        /// Test a few common katakana spelling variations.
        /// <para/>
        /// English translations are as follows:
        /// <list type="bullet">
        ///     <item><description>copy</description></item>
        ///     <item><description>coffee</description></item>
        ///     <item><description>taxi</description></item>
        ///     <item><description>party</description></item>
        ///     <item><description>party (without long sound)</description></item>
        ///     <item><description>center</description></item>
        /// </list>
        /// Note that we remove a long sound in the case of "coffee" that is required.
        /// </summary>
        [Test]
        public void TestStemVariants()
        {
            AssertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
                new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
                new int[] { 0, 4, 9, 14, 20, 25 },
                new int[] { 3, 8, 13, 19, 24, 29 });
        }

        /// <summary>
        /// A term placed in the <see cref="SetKeywordMarkerFilter"/> exclusion set
        /// is expected to pass through unchanged.
        /// </summary>
        [Test]
        public void TestKeyword()
        {
            CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("コーヒー"), false);
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
                return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
            });
            CheckOneTerm(a, "コーヒー", "コーヒー");
        }

        [Test]
        public void TestUnsupportedHalfWidthVariants()
        {
            // The below result is expected since only full-width katakana is supported
            AssertAnalyzesTo(analyzer, "ﾀｸｼｰ", new String[] { "ﾀｸｼｰ" });
        }

        [Test]
        public void TestRandomData()
        {
            CheckRandomData(Random(), analyzer, 1000 * RANDOM_MULTIPLIER);
        }

        /// <summary>
        /// An empty term produced by a <see cref="KeywordTokenizer"/> must stay empty.
        /// </summary>
        [Test]
        public void TestEmptyTerm()
        {
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new JapaneseKatakanaStemFilter(tokenizer));
            });

            CheckOneTerm(a, "", "");
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
new file mode 100644
index 0000000..49ac181
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
@@ -0,0 +1,62 @@
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Simple tests for <see cref="JapaneseKatakanaStemFilterFactory"/>
    /// </summary>
    public class TestJapaneseKatakanaStemFilterFactory : BaseTokenStreamTestCase
    {
        [Test]
        public void TestKatakanaStemming()
        {
            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));
            TokenStream tokenStream = tokenizerFactory.Create(
                new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。")
            );
            JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new Dictionary<String, String>());
            AssertTokenStreamContents(filterFactory.Create(tokenStream),
                new String[]{ "明後日", "パーティ", "に", "行く", "予定", "が", "ある",   // パーティー should be stemmed
                              "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"} // コピー should not be stemmed
            );
        }

        /// <summary>
        /// Test that bogus arguments result in exception
        /// </summary>
        [Test]
        public void TestBogusArguments()
        {
            try
            {
                new JapaneseKatakanaStemFilterFactory(new Dictionary<String, String>() {
                    { "bogusArg", "bogusValue" }
                });
                fail();
            }
            catch (ArgumentException expected)
            {
                assertTrue(expected.Message.Contains("Unknown parameters"));
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
new file mode 100644
index 0000000..617a1b8
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
@@ -0,0 +1,70 @@
using Lucene.Net.Support;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
*/

    /// <summary>
    /// Simple tests for <see cref="JapanesePartOfSpeechStopFilterFactory"/>
    /// </summary>
    public class TestJapanesePartOfSpeechStopFilterFactory : BaseTokenStreamTestCase
    {
        /// <summary>
        /// Supplies a one-entry stop-tag list through a mock resource loader and
        /// checks the tokens that remain after filtering.
        /// </summary>
        [Test]
        public void TestBasics()
        {
            String tags =
                "#  verb-main:\n" +
                "動詞-自立\n";

            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));
            TokenStream ts = tokenizerFactory.Create(new StringReader("私は制限スピードを超える。"));
            IDictionary<String, String> args = new Dictionary<String, String>();
            args.Put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
            args.Put("tags", "stoptags.txt");
            JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
            factory.Inform(new StringMockResourceLoader(tags));
            ts = factory.Create(ts);
            AssertTokenStreamContents(ts,
                new String[] { "私", "は", "制限", "スピード", "を" }
            );
        }

        /// <summary>
        /// Test that bogus arguments result in exception
        /// </summary>
        [Test]
        public void TestBogusArguments()
        {
            try
            {
                new JapanesePartOfSpeechStopFilterFactory(new Dictionary<String, String>() {
                    { "luceneMatchVersion", TEST_VERSION_CURRENT.toString() },
                    { "bogusArg", "bogusValue" }
                });
                fail();
            }
            catch (ArgumentException expected)
            {
                assertTrue(expected.Message.Contains("Unknown parameters"));
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
new file mode 100644
index 0000000..141db33
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
@@ -0,0 +1,109 @@
using Lucene.Net.Analysis.Cjk;
using Lucene.Net.Analysis.Core;
using NUnit.Framework;
using System;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests for <see cref="JapaneseReadingFormFilter"/>
    /// </summary>
    public class TestJapaneseReadingFormFilter : BaseTokenStreamTestCase
    {
        // Reading form filter constructed with `false` as its second argument
        // (the katakana-reading tests use this analyzer).
        private readonly Analyzer katakanaAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
            return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, false));
        });

        // Reading form filter constructed with `true` as its second argument
        // (the romaji-reading tests use this analyzer).
        private readonly Analyzer romajiAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
            return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, true));
        });


        [Test]
        public void TestKatakanaReadings()
        {
            AssertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した",
                new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
            );
        }

        /// <summary>
        /// Same expectation as <see cref="TestKatakanaReadings"/>, but the input contains
        /// half-width katakana and a <see cref="CJKWidthFilter"/> sits in front of the filter.
        /// </summary>
        [Test]
        public void TestKatakanaReadingsHalfWidth()
        {
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
                TokenStream stream = new CJKWidthFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
            });

            AssertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
                new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
            );
        }

        [Test]
        public void TestRomajiReadings()
        {
            AssertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
                new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
            );
        }

        /// <summary>
        /// Same expectation as <see cref="TestRomajiReadings"/>, but the input contains
        /// half-width katakana and a <see cref="CJKWidthFilter"/> sits in front of the filter.
        /// </summary>
        [Test]
        public void TestRomajiReadingsHalfWidth()
        {
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
                TokenStream stream = new CJKWidthFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
            });

            AssertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
                new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
            );
        }

        [Test]
        public void TestRandomData()
        {
            // One Random instance drives both analyzers.
            Random random = Random();
            CheckRandomData(random, katakanaAnalyzer, 1000 * RANDOM_MULTIPLIER);
            CheckRandomData(random, romajiAnalyzer, 1000 * RANDOM_MULTIPLIER);
        }

        /// <summary>
        /// An empty term produced by a <see cref="KeywordTokenizer"/> must stay empty.
        /// </summary>
        [Test]
        public void TestEmptyTerm()
        {
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer));
            });

            CheckOneTerm(a, "", "");
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs new file mode 100644 index 0000000..053652b --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs @@ -0,0 +1,59 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Ja +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+     */
+
+    /// <summary>
+    /// Tests for <see cref="JapaneseReadingFormFilterFactory"/>: one end-to-end run with default
+    /// arguments, and one check that unknown arguments are rejected.
+    /// </summary>
+    public class TestJapaneseReadingFormFilterFactory : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Tokenizes a sentence with a default <see cref="JapaneseTokenizerFactory"/>, passes it
+        /// through a filter created with no arguments, and asserts the emitted terms.
+        /// </summary>
+        [Test]
+        public void TestReadings()
+        {
+            var noArgs = new Dictionary<String, String>();
+            var tokenizerFactory = new JapaneseTokenizerFactory(noArgs);
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream tokenized = tokenizerFactory.Create(new StringReader("å ã»ã©ãã«ãªã³ããæ¥ã¾ããã"));
+
+            // A separate, empty argument map for the filter factory, as in the original flow.
+            var filterFactory = new JapaneseReadingFormFilterFactory(new Dictionary<String, String>());
+            TokenStream withReadings = filterFactory.Create(tokenized);
+
+            AssertTokenStreamContents(withReadings,
+                new String[] { "ãµã", "ãã", "ãã«ãªã³", "ã«ã©", "ã", "ãã·", "ã¿" }
+            );
+        }
+
+        /// <summary>
+        /// Test that bogus arguments result in exception: the constructor must throw an
+        /// <see cref="ArgumentException"/> whose message contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            try
+            {
+                var bogusArgs = new Dictionary<String, String>()
+                {
+                    { "bogusArg", "bogusValue" }
+                };
+                new JapaneseReadingFormFilterFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the unknown key.
+                fail();
+            }
+            catch (ArgumentException thrown)
+            {
+                assertTrue(thrown.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
new file mode 100644
index 0000000..0a1f819
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
@@ -0,0 +1,846 @@
+using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public class TestJapaneseTokenizer : BaseTokenStreamTestCase + { + public static UserDictionary ReadDict() + { + Stream @is = typeof(TestJapaneseTokenizer).getResourceAsStream("userdict.txt"); + if (@is == null) + { + throw new Exception("Cannot find userdict.txt in test classpath!"); + } + try + { + try + { + TextReader reader = new StreamReader(@is, Encoding.UTF8); + return new UserDictionary(reader); + } + finally + { + @is.Dispose(); + } + } + catch (IOException ioe) + { + throw new Exception(ioe.ToString(), ioe); + } + } + + private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH); + return new TokenStreamComponents(tokenizer, tokenizer); + }); + + + private Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL); + return new TokenStreamComponents(tokenizer, tokenizer); + }); + + private Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new JapaneseTokenizer(reader, 
ReadDict(), true, JapaneseTokenizerMode.SEARCH); + return new TokenStreamComponents(tokenizer, tokenizer); + }); + + + private Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.EXTENDED); + return new TokenStreamComponents(tokenizer, tokenizer); + }); + + + [Test] + public void TestNormalMode() + { + AssertAnalyzesTo(analyzerNormal, + "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢", + new String[] { "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢" }); + } + + [Test] + public void TestDecomposition1() + { + AssertAnalyzesTo(analyzerNoPunct, "æ¬æ¥ã¯ãè²§å°å±¤ã®å¥³æ§ãåä¾ã«å»çä¿è·ãæä¾ããããã«åµè¨ãããå¶åº¦ã§ããã" + + "ã¢ã¡ãªã«ä½æå¾è å»çæ´å©å¶åº¦ãã仿¥ã§ã¯ããã®äºç®ã®ç´ï¼åã®ï¼ãè人ã«è²»ããã¦ããã", + new String[] { "æ¬æ¥", "ã¯", "è²§å°", "層", "ã®", "女æ§", "ã", "åä¾", "ã«", "å»ç", "ä¿è·", "ã", + "æä¾", "ãã", "ãã", "ã«", "åµè¨", "ã", "ã", "ã", "å¶åº¦", "ã§", "ãã", "ã¢ã¡ãªã«", + "ä½", "æå¾", "è ", "å»ç", "æ´å©", "å¶åº¦", "ã", "仿¥", "ã§", "ã¯", "ãã®", + "äºç®", "ã®", "ç´", "ï¼", "åã®", "ï¼", "ã", "è人", "ã«", "è²»ãã", "ã¦", "ãã" }, + new int[] { 0, 2, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, + 31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 60, + 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 }, + new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31, + 33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 62, + 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 } + ); + } + + [Test] + public void TestDecomposition2() + { + AssertAnalyzesTo(analyzerNoPunct, "麻è¬ã®å¯å£²ã¯æ ¹ãããçµ¶ãããªããã°ãªããªã", + new String[] { "麻è¬", "ã®", "å¯å£²", "ã¯", "æ ¹ããã", "çµ¶ãã", "ãªãã", "ã°", "ãªã", "ãªã" }, + new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 }, + new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 } + ); + } + + [Test] + public void TestDecomposition3() + { + AssertAnalyzesTo(analyzerNoPunct, "é女ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã", + new 
String[] { "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ãããã³ã¹" }, + new int[] { 0, 2, 3, 5, 10 }, + new int[] { 2, 3, 5, 9, 15 } + ); + } + + [Test] + public void TestDecomposition4() + { + AssertAnalyzesTo(analyzer, "ããã¯æ¬ã§ã¯ãªã", + new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" }, + new int[] { 0, 2, 3, 4, 5, 6 }, + new int[] { 2, 3, 4, 5, 6, 8 } + ); + } + + /* Note this is really a stupid test just to see if things arent horribly slow. + * ideally the test would actually fail instead of hanging... + */ + [Test] + public void TestDecomposition5() + { + TokenStream ts = analyzer.GetTokenStream("bogus", "ãããããããããããããããããããããããããããããããããããããããã"); + try + { + ts.Reset(); + while (ts.IncrementToken()) + { + + } + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + /* + // NOTE: intentionally fails! Just trying to debug this + // one input... + public void testDecomposition6() throws Exception { + assertAnalyzesTo(analyzer, "å¥è¯å 端ç§å¦æè¡å¤§å¦é¢å¤§å¦", + new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" }, + new int[] { 0, 2, 3, 4, 5, 6 }, + new int[] { 2, 3, 4, 5, 6, 8 } + ); + } + */ + + /** Tests that sentence offset is incorporated into the resulting offsets */ + [Test] + public void TestTwoSentences() + { + /* + //TokenStream ts = a.tokenStream("foo", "妹ã®å²åã§ãã俺ã¨å¹´åã§ãä»åé¨çã§ãã"); + TokenStream ts = analyzer.tokenStream("foo", "�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"); + ts.reset(); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + while(ts.incrementToken()) { + System.out.println(" " + termAtt.toString()); + } + System.out.println("DONE PARSE\n\n"); + */ + + AssertAnalyzesTo(analyzerNoPunct, "é女ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã é女ç©å¤§å°ãã·ã¥ã¼ã»ãããã³ã¹ã", + new String[] { "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ãããã³ã¹", "é女", "ç©", "大å°", "ãã·ã¥ã¼", "ãããã³ã¹" }, + new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 }, + new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 } + ); + } + + /** blast some random strings through 
the analyzer */ + [Test] + public void TestRandomStrings() + { + CheckRandomData(Random(), analyzer, 1000 * RANDOM_MULTIPLIER); + CheckRandomData(Random(), analyzerNoPunct, 1000 * RANDOM_MULTIPLIER); + } + + /** blast some random large strings through the analyzer */ + [Test] + public void TestRandomHugeStrings() + { + Random random = Random(); + CheckRandomData(random, analyzer, 100 * RANDOM_MULTIPLIER, 8192); + CheckRandomData(random, analyzerNoPunct, 100 * RANDOM_MULTIPLIER, 8192); + } + + [Test] + public void TestRandomHugeStringsMockGraphAfter() + { + // Randomly inject graph tokens after JapaneseTokenizer: + Random random = Random(); + CheckRandomData(random, + Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH); + TokenStream graph = new MockGraphTokenFilter(Random(), tokenizer); + return new TokenStreamComponents(tokenizer, graph); + }), + 100 * RANDOM_MULTIPLIER, 8192); + } + + [Test] + public void TestLargeDocReliability() + { + for (int i = 0; i < 100; i++) + { + String s = TestUtil.RandomUnicodeString(Random(), 10000); + TokenStream ts = analyzer.GetTokenStream("foo", s); + try + { + ts.Reset(); + while (ts.IncrementToken()) + { + } + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + } + + /** simple test for supplementary characters */ + [Test] + public void TestSurrogates() + { + AssertAnalyzesTo(analyzer, "𩬠è±é乿¯ç", + new String[] { "𩬠", "è±", "é", "ä¹", "æ¯", "ç" }); + } + + /** random test ensuring we don't ever split supplementaries */ + [Test] + public void TestSurrogates2() + { + int numIterations = AtLeast(10000); + for (int i = 0; i < numIterations; i++) + { + if (VERBOSE) + { + Console.WriteLine("\nTEST: iter=" + i); + } + String s = TestUtil.RandomUnicodeString(Random(), 100); + TokenStream ts = analyzer.GetTokenStream("foo", s); + try + { + ICharTermAttribute termAtt = 
ts.AddAttribute<ICharTermAttribute>(); + ts.Reset(); + while (ts.IncrementToken()) + { + assertTrue(UnicodeUtil.ValidUTF16String(termAtt)); + } + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + } + + [Test] + public void TestOnlyPunctuation() + { + TokenStream ts = analyzerNoPunct.GetTokenStream("foo", "ãããã"); + try + { + ts.Reset(); + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + [Test] + public void TestOnlyPunctuationExtended() + { + TokenStream ts = extendedModeAnalyzerNoPunct.GetTokenStream("foo", "......"); + try + { + ts.Reset(); + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + // note: test is kinda silly since kuromoji emits punctuation tokens. + // but, when/if we filter these out it will be useful. + [Test] + public void TestEnd() + { + AssertTokenStreamContents(analyzerNoPunct.GetTokenStream("foo", "ããã¯æ¬ã§ã¯ãªã"), + new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" }, + new int[] { 0, 2, 3, 4, 5, 6 }, + new int[] { 2, 3, 4, 5, 6, 8 }, + new int?(8) + ); + + AssertTokenStreamContents(analyzerNoPunct.GetTokenStream("foo", "ããã¯æ¬ã§ã¯ãªã "), + new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" }, + new int[] { 0, 2, 3, 4, 5, 6, 8 }, + new int[] { 2, 3, 4, 5, 6, 8, 9 }, + new int?(12) + ); + } + + [Test] + public void TestUserDict() + { + // Not a great test because w/o userdict.txt the + // segmentation is the same: + AssertTokenStreamContents(analyzer.GetTokenStream("foo", "é¢è¥¿å½é空港ã«è¡ã£ã"), + new String[] { "é¢è¥¿", "å½é", "空港", "ã«", "è¡ã£", "ã" }, + new int[] { 0, 2, 4, 6, 7, 9 }, + new int[] { 2, 4, 6, 7, 9, 10 }, + new int?(10) + ); + } + + [Test] + public void TestUserDict2() + { + // Better test: w/o userdict the segmentation is different: + AssertTokenStreamContents(analyzer.GetTokenStream("foo", "æéé¾"), + new String[] { "æéé¾" }, + new int[] { 0 }, + new 
int[] { 3 }, + new int?(3) + ); + } + + [Test] + public void TestUserDict3() + { + // Test entry that breaks into multiple tokens: + AssertTokenStreamContents(analyzer.GetTokenStream("foo", "abcd"), + new String[] { "a", "b", "cd" }, + new int[] { 0, 1, 2 }, + new int[] { 1, 2, 4 }, + new int?(4) + ); + } + + // HMM: fails (segments as a/b/cd/efghij)... because the + // two paths have exactly equal paths (1 KNOWN + 1 + // UNKNOWN) and we don't seem to favor longer KNOWN / + // shorter UNKNOWN matches: + + /* + public void testUserDict4() { + // Test entry that has another entry as prefix + assertTokenStreamContents(analyzer.tokenStream("foo", "abcdefghij"), + new String[] { "ab", "cd", "efg", "hij" }, + new int[] { 0, 2, 4, 7 }, + new int[] { 2, 4, 7, 10 }, + new int?(10) + ); + } + */ + + [Test] + public void TestSegmentation() + { + // Skip tests for Michelle Kwan -- UniDic segments Kwan as 㯠ã¯ã³ + // String input = "ãã·ã§ã«ã»ã¯ã¯ã³ãåªåãã¾ãããã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã"; + // String[] surfaceForms = { + // "ãã·ã§ã«", "ã»", "ã¯ã¯ã³", "ã", "åªå", "ã", "ã¾ã", "ã", "ã", + // "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", "ã¾ã", "ã", + // "ãããããã", "ã" + // }; + String input = "ã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã"; + String[] + surfaceForms = { + "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", "ã¾ã", "ã", + "ãããããã", "ã" + }; + AssertAnalyzesTo(analyzer, + input, + surfaceForms); + } + + [Test] + public void TestLatticeToDot() + { + GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.GetInstance()); + Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + JapaneseTokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH) + { + GraphvizFormatter = gv2 + }; + return new TokenStreamComponents(tokenizer, tokenizer); + }); + + + String input = "ã¹ãã¼ã¹ã¹ãã¼ã·ã§ã³ã«è¡ãã¾ããããããããã"; + String[] surfaceForms = { + "ã¹ãã¼ã¹", "ã¹ãã¼ã·ã§ã³", "ã«", "è¡ã", "ã¾ã", "ã", + "ãããããã", "ã" + }; + 
AssertAnalyzesTo(analyzer, + input, + surfaceForms); + + + assertTrue(gv2.Finish().IndexOf("22.0") != -1); + } + + private void assertReadings(String input, params String[] readings) + { + TokenStream ts = analyzer.GetTokenStream("ignored", input); + try + { + IReadingAttribute readingAtt = ts.AddAttribute<IReadingAttribute>(); + ts.Reset(); + foreach (String reading in readings) + { + assertTrue(ts.IncrementToken()); + assertEquals(reading, readingAtt.GetReading()); + } + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + private void assertPronunciations(String input, params String[] pronunciations) + { + TokenStream ts = analyzer.GetTokenStream("ignored", input); + try + { + IReadingAttribute readingAtt = ts.AddAttribute<IReadingAttribute>(); + ts.Reset(); + foreach (String pronunciation in pronunciations) + { + assertTrue(ts.IncrementToken()); + assertEquals(pronunciation, readingAtt.GetPronunciation()); + } + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + private void assertBaseForms(String input, params String[] baseForms) + { + TokenStream ts = analyzer.GetTokenStream("ignored", input); + try + { + IBaseFormAttribute baseFormAtt = ts.AddAttribute<IBaseFormAttribute>(); + ts.Reset(); + foreach (String baseForm in baseForms) + { + assertTrue(ts.IncrementToken()); + assertEquals(baseForm, baseFormAtt.GetBaseForm()); + } + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + private void assertInflectionTypes(String input, params String[] inflectionTypes) + { + TokenStream ts = analyzer.GetTokenStream("ignored", input); + try + { + IInflectionAttribute inflectionAtt = ts.AddAttribute<IInflectionAttribute>(); + ts.Reset(); + foreach (String inflectionType in inflectionTypes) + { + assertTrue(ts.IncrementToken()); + assertEquals(inflectionType, 
inflectionAtt.GetInflectionType()); + } + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + private void assertInflectionForms(String input, params String[] inflectionForms) + { + TokenStream ts = analyzer.GetTokenStream("ignored", input); + try + { + IInflectionAttribute inflectionAtt = ts.AddAttribute<IInflectionAttribute>(); + ts.Reset(); + foreach (String inflectionForm in inflectionForms) + { + assertTrue(ts.IncrementToken()); + assertEquals(inflectionForm, inflectionAtt.GetInflectionForm()); + } + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + private void assertPartsOfSpeech(String input, params String[] partsOfSpeech) + { + TokenStream ts = analyzer.GetTokenStream("ignored", input); + try + { + IPartOfSpeechAttribute partOfSpeechAtt = ts.AddAttribute<IPartOfSpeechAttribute>(); + ts.Reset(); + foreach (String partOfSpeech in partsOfSpeech) + { + assertTrue(ts.IncrementToken()); + assertEquals(partOfSpeech, partOfSpeechAtt.GetPartOfSpeech()); + } + assertFalse(ts.IncrementToken()); + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + + [Test] + public void TestReadings() + { + assertReadings("寿å¸ãé£ã¹ããã§ãã", + "ã¹ã·", + "ã¬", + "ã¿ã", + "ã¿ã¤", + "ãã¹", + "ã"); + } + + [Test] + public void TestReadings2() + { + assertReadings("å¤ãã®å¦çã試é¨ã«è½ã¡ãã", + "ãªãªã¯", + "ã", + "ã¬ã¯ã»ã¤", + "ã¬", + "ã·ã±ã³", + "ã", + "ãªã", + "ã¿", + "ã"); + } + + [Test] + public void TestPronunciations() + { + assertPronunciations("寿å¸ãé£ã¹ããã§ãã", + "ã¹ã·", + "ã¬", + "ã¿ã", + "ã¿ã¤", + "ãã¹", + "ã"); + } + + [Test] + public void TestPronunciations2() + { + // pronunciation differs from reading here + assertPronunciations("å¤ãã®å¦çã試é¨ã«è½ã¡ãã", + "ãªã¼ã¯", + "ã", + "ã¬ã¯ã»ã¤", + "ã¬", + "ã·ã±ã³", + "ã", + "ãªã", + "ã¿", + "ã"); + } + + [Test] + public void TestBasicForms() + { + assertBaseForms("ããã¯ã¾ã 
å®é¨æ®µéã«ããã¾ãã", + null, + null, + null, + null, + null, + null, + "ãã", + null, + null); + } + + [Test] + public void TestInflectionTypes() + { + assertInflectionTypes("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã", + null, + null, + null, + null, + null, + null, + "äºæ®µã»ã©è¡", + "ç¹æ®ã»ãã¹", + null); + } + + [Test] + public void TestInflectionForms() + { + assertInflectionForms("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã", + null, + null, + null, + null, + null, + null, + "é£ç¨å½¢", + "åºæ¬å½¢", + null); + } + + [Test] + public void TestPartOfSpeech() + { + assertPartsOfSpeech("ããã¯ã¾ã å®é¨æ®µéã«ããã¾ãã", + "åè©-代åè©-ä¸è¬", + "å©è©-ä¿å©è©", + "å¯è©-å©è©é¡æ¥ç¶", + "åè©-ãµå¤æ¥ç¶", + "åè©-ä¸è¬", + "å©è©-æ ¼å©è©-ä¸è¬", + "åè©-èªç«", + "å©åè©", + "è¨å·-å¥ç¹"); + } + + // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix? + // do we have a possibility to actually lookup the first and last word from dictionary? + [Test] + public void TestYabottai() + { + AssertAnalyzesTo(analyzer, "ãã¼ã£ãã", + new String[] { "ãã¼ã£ãã" }); + } + + [Test] + public void TestTsukitosha() + { + AssertAnalyzesTo(analyzer, "çªãéãã", + new String[] { "çªãéãã" }); + } + + [Test] + public void TestBocchan() + { + doTestBocchan(1); + } + + [Test, LongRunningTest]//@Nightly + [Ignore("This test takes a long time to run - do it manually")] + public void TestBocchanBig() + { + doTestBocchan(100); + } + + /* + public void testWikipedia() { + final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml"); + final Reader r = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)); + + final long startTimeNS = System.nanoTime(); + boolean done = false; + long compoundCount = 0; + long nonCompoundCount = 0; + long netOffset = 0; + while (!done) { + final TokenStream ts = analyzer.tokenStream("ignored", r); + ts.reset(); + final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + final OffsetAttribute 
offsetAtt = ts.addAttribute(OffsetAttribute.class); + int count = 0; + while (true) { + if (!ts.incrementToken()) { + done = true; + break; + } + count++; + if (posIncAtt.getPositionIncrement() == 0) { + compoundCount++; + } else { + nonCompoundCount++; + if (nonCompoundCount % 1000000 == 0) { + System.out.println(String.format("%.2f msec [pos=%d, %d, %d]", + (System.nanoTime()-startTimeNS)/1000000.0, + netOffset + offsetAtt.startOffset(), + nonCompoundCount, + compoundCount)); + } + } + if (count == 100000000) { + System.out.println(" again..."); + break; + } + } + ts.end(); + netOffset += offsetAtt.endOffset(); + } + System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount); + r.close(); + } + */ + + + private void doTestBocchan(int numIterations) + { + TextReader reader = new StreamReader( + this.GetType().getResourceAsStream("bocchan.utf-8"), Encoding.UTF8); + String line = reader.ReadLine(); + reader.Dispose(); + + if (VERBOSE) + { + Console.WriteLine("Test for Bocchan without pre-splitting sentences"); + } + + /* + if (numIterations > 1) { + // warmup + for (int i = 0; i < numIterations; i++) { + final TokenStream ts = analyzer.tokenStream("ignored", line); + ts.reset(); + while(ts.incrementToken()); + } + } + */ + + long totalStart = Environment.TickCount; + for (int i = 0; i < numIterations; i++) + { + TokenStream ts = analyzer.GetTokenStream("ignored", line); + try + { + ts.Reset(); + while (ts.IncrementToken()) ; + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + String[] sentences = Regex.Split(line, "ã|ã"); + if (VERBOSE) + { + Console.WriteLine("Total time : " + (Environment.TickCount - totalStart)); + Console.WriteLine("Test for Bocchan with pre-splitting sentences (" + sentences.Length + " sentences)"); + } + totalStart = Environment.TickCount; + for (int i = 0; i < numIterations; i++) + { + foreach (String sentence in sentences) + { + TokenStream ts = 
analyzer.GetTokenStream("ignored", sentence); + try + { + ts.Reset(); + while (ts.IncrementToken()) ; + ts.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(ts); + } + } + } + if (VERBOSE) + { + Console.WriteLine("Total time : " + (Environment.TickCount - totalStart)); + } + } + + [Test] + public void TestWithPunctuation() + { + AssertAnalyzesTo(analyzerNoPunct, "ç¾½ç°ã空港", + new String[] { "ç¾½ç°", "空港" }, + new int[] { 1, 1 }); + } + + [Test] + public void TestCompoundOverPunctuation() + { + AssertAnalyzesToPositions(analyzerNoPunct, "dεε϶ϢÏÎÏ·Îͺ羽ç°", + new String[] { "d", "ε", "ε", "Ï¢ÏÎÏ·Îͺ", "ç¾½ç°" }, + new int[] { 1, 1, 1, 1, 1 }, + new int[] { 1, 1, 1, 1, 1 }); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs new file mode 100644 index 0000000..91fbf16 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs @@ -0,0 +1,134 @@ +using Lucene.Net.Support; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Ja +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests for <see cref="JapaneseTokenizerFactory"/>: default construction and the
+    /// "mode", "userDictionary" and "discardPunctuation" arguments, plus rejection of unknown ones.
+    /// </summary>
+    public class TestJapaneseTokenizerFactory : BaseTokenStreamTestCase
+    {
+        /// <summary>Default factory: asserts terms plus start and end offsets for one sentence.</summary>
+        [Test]
+        public void TestSimple()
+        {
+            var tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream tokens = tokenizerFactory.Create(new StringReader("ããã¯æ¬ã§ã¯ãªã"));
+
+            String[] expectedTerms = new String[] { "ãã", "ã¯", "æ¬", "ã§", "ã¯", "ãªã" };
+            int[] expectedStarts = new int[] { 0, 2, 3, 4, 5, 6 };
+            int[] expectedEnds = new int[] { 2, 3, 4, 5, 6, 8 };
+            AssertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /// <summary>
+        /// Test that search mode is enabled and working by default
+        /// </summary>
+        [Test]
+        public void TestDefaults()
+        {
+            var tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream tokens = tokenizerFactory.Create(new StringReader("ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢"));
+
+            AssertTokenStreamContents(tokens,
+                new String[] { "ã·ãã¢", "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢", "ã½ããã¦ã§ã¢", "ã¨ã³ã¸ãã¢" }
+            );
+        }
+
+        /// <summary>
+        /// Test mode parameter: specifying normal mode
+        /// </summary>
+        [Test]
+        public void TestMode()
+        {
+            IDictionary<String, String> factoryArgs = new Dictionary<String, String>();
+            factoryArgs.Put("mode", "normal");
+            var tokenizerFactory = new JapaneseTokenizerFactory(factoryArgs);
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+            TokenStream tokens = tokenizerFactory.Create(new StringReader("ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢"));
+
+            AssertTokenStreamContents(tokens,
+                new String[] { "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢" }
+            );
+        }
+
+        /// <summary>
+        /// Test user dictionary
+        /// </summary>
+        [Test]
+        public void TestUserDict()
+        {
+            // Dictionary text handed to the mock resource loader; '#' lines are comments in the data.
+            String userDictText =
+                "# Custom segmentation for long entries\n" +
+                "æ¥æ¬çµæ¸æ°è,æ¥æ¬ çµæ¸ æ°è,ããã³ ã±ã¤ã¶ã¤ ã·ã³ãã³,ã«ã¹ã¿ã åè©\n" +
+                "é¢è¥¿å½é空港,é¢è¥¿ å½é 空港,ã«ã³ãµã¤ ã³ã¯ãµã¤ ã¯ã¦ã³ã¦,ãã¹ãåè©\n" +
+                "# Custom reading for sumo wrestler\n" +
+                "æéé¾,æéé¾,ã¢ãµã·ã§ã¦ãªã¥ã¦,ã«ã¹ã¿ã 人å\n";
+
+            IDictionary<String, String> factoryArgs = new Dictionary<String, String>();
+            factoryArgs.Put("userDictionary", "userdict.txt");
+            var tokenizerFactory = new JapaneseTokenizerFactory(factoryArgs);
+            tokenizerFactory.Inform(new StringMockResourceLoader(userDictText));
+            TokenStream tokens = tokenizerFactory.Create(new StringReader("é¢è¥¿å½é空港ã«è¡ã£ã"));
+
+            AssertTokenStreamContents(tokens,
+                new String[] { "é¢è¥¿", "å½é", "空港", "ã«", "è¡ã£", "ã" }
+            );
+        }
+
+        /// <summary>
+        /// Test preserving punctuation
+        /// </summary>
+        [Test]
+        public void TestPreservePunctuation()
+        {
+            IDictionary<String, String> factoryArgs = new Dictionary<String, String>();
+            factoryArgs.Put("discardPunctuation", "false");
+            var tokenizerFactory = new JapaneseTokenizerFactory(factoryArgs);
+            tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+            var input = new StringReader("ä»ãã«ã¦ã§ã¼ã«ãã¾ãããæ¥é±ã®é æ¥æ¬ã«æ»ãã¾ããæ¥½ãã¿ã«ãã¦ãã¾ãï¼ã寿å¸ãé£ã¹ãããªããã");
+            TokenStream tokens = tokenizerFactory.Create(input);
+
+            AssertTokenStreamContents(tokens,
+                new String[] { "ä»", "ãã«ã¦ã§ã¼", "ã«", "ã", "ã¾ã", "ã", "ã",
+                    "æ¥é±", "ã®", "é ", "æ¥æ¬", "ã«", "æ»ã", "ã¾ã", "ã",
+                    "楽ãã¿", "ã«", "ã", "ã¦", "ã", "ã¾ã", "ï¼",
+                    "ã", "寿å¸", "ã", "é£ã¹", "ãã", "ãª", "ã", "ã", "ã" }
+            );
+        }
+
+        /// <summary>
+        /// Test that bogus arguments result in exception: the constructor must throw an
+        /// <see cref="ArgumentException"/> whose message contains "Unknown parameters".
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            try
+            {
+                var bogusArgs = new Dictionary<String, String>()
+                {
+                    { "bogusArg", "bogusValue" }
+                };
+                new JapaneseTokenizerFactory(bogusArgs);
+                // Reaching this line means the constructor accepted the unknown key.
+                fail();
+            }
+            catch (ArgumentException thrown)
+            {
+                assertTrue(thrown.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs new file mode 100644 index 0000000..bb9fdae --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs @@ -0,0 +1,92 @@ +using NUnit.Framework; +using System; +using System.IO; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Ja +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+     */
+
+    /// <summary>
+    /// Data-driven test of <see cref="JapaneseTokenizer"/> in <c>JapaneseTokenizerMode.SEARCH</c>:
+    /// each data line of the segmentation resource supplies an input text and its expected tokens.
+    /// </summary>
+    public class TestSearchMode : BaseTokenStreamTestCase
+    {
+        private static readonly String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
+
+        // Search-mode tokenizer built without a user dictionary (second argument is null).
+        private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        {
+            Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+            return new TokenStreamComponents(tokenizer, tokenizer);
+        });
+
+
+        /// <summary>
+        /// Test search mode segmentation.
+        /// <para>
+        /// Resource format as parsed below: text from '#' to end of line is dropped, blank lines
+        /// are skipped, and each remaining line is "source TAB expected-tokens", with the expected
+        /// tokens separated by whitespace. A token ending in "/0" is expected with position
+        /// increment 0 (the marker is stripped before comparing); every other token with increment 1.
+        /// </para>
+        /// </summary>
+        /// <exception cref="FileNotFoundException">The segmentation resource cannot be found.</exception>
+        [Test]
+        public void TestSearchSegmentation()
+        {
+            using (Stream @is = typeof(TestSearchMode).getResourceAsStream(SEGMENTATION_FILENAME))
+            {
+                if (@is == null)
+                {
+                    throw new FileNotFoundException("Cannot find " + SEGMENTATION_FILENAME + " in test classpath");
+                }
+
+                // The reader is disposed too, so it is released even when an assertion fails mid-file.
+                using (TextReader reader = new StreamReader(@is, Encoding.UTF8))
+                {
+                    String line;
+                    int lineNumber = 0;
+
+                    while ((line = reader.ReadLine()) != null)
+                    {
+                        lineNumber++;
+                        // Remove comments
+                        line = Regex.Replace(line, "#.*$", "");
+                        // Skip empty lines or comment lines
+                        if (line.Trim() == string.Empty)
+                        {
+                            continue;
+                        }
+                        if (VERBOSE)
+                        {
+                            Console.WriteLine("Line no. " + lineNumber + ": " + line);
+                        }
+                        // Split on the first tab only: [0] = source text, [1] = expected tokens.
+                        String[] fields = new Regex("\t").Split(line, 2);
+                        String sourceText = fields[0];
+                        String[] expectedTokens = Regex.Split(fields[1], "\\s+");
+                        // Elements default to 0, which is the value wanted for "/0" tokens.
+                        int[] expectedPosIncrs = new int[expectedTokens.Length];
+                        for (int tokIDX = 0; tokIDX < expectedTokens.Length; tokIDX++)
+                        {
+                            if (expectedTokens[tokIDX].EndsWith("/0", StringComparison.Ordinal))
+                            {
+                                expectedTokens[tokIDX] = Regex.Replace(expectedTokens[tokIDX], "/0", "");
+                            }
+                            else
+                            {
+                                expectedPosIncrs[tokIDX] = 1;
+                            }
+                        }
+                        // Only terms and position increments are asserted; position lengths are not checked.
+                        AssertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
+                    }
+                }
+            }
+        }
+    }
+}
