[03/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

nightowl888 Sun, 23 Jul 2017 10:37:18 -0700

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/Tools/UnknownDictionaryTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/Tools/UnknownDictionaryTest.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/Tools/UnknownDictionaryTest.cs
new file mode 100644
index 0000000..a4940f5
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/Tools/UnknownDictionaryTest.cs
@@ -0,0 +1,93 @@
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Exercises <c>UnknownDictionaryWriter.PutCharacterCategory</c> and
+    /// <c>UnknownDictionaryWriter.Put</c> with input that is expected to be
+    /// rejected as well as input that is expected to be accepted.
+    /// </summary>
+    public class UnknownDictionaryTest : LuceneTestCase
+    {
+        // Not referenced by the tests in this class; kept because it is public.
+        public static readonly string FILENAME = "unk-tokeninfo-dict.obj";
+
+        /// <summary>
+        /// Runs <paramref name="action"/> and fails the test unless it throws.
+        /// </summary>
+        /// <remarks>
+        /// <c>fail()</c> is deliberately called outside the <c>try</c> block.
+        /// NUnit's <c>AssertionException</c> derives from <see cref="Exception"/>,
+        /// so a failure raised inside the <c>try</c> would be swallowed by the
+        /// <c>catch</c> below and the negative check could never fail.
+        /// NOTE(review): assumes <c>fail()</c> reports failure by throwing - confirm
+        /// against <c>LuceneTestCase</c>.
+        /// </remarks>
+        /// <param name="action">The call that is expected to throw.</param>
+        private void AssertThrows(Action action)
+        {
+            bool threw = false;
+            try
+            {
+                action();
+            }
+            catch (Exception)
+            {
+                // Any exception type counts as the expected rejection.
+                threw = true;
+            }
+
+            if (!threw)
+            {
+                fail();
+            }
+        }
+
+        /// <summary>
+        /// Checks that two invalid category registrations throw and that five
+        /// registrations with ids 0 through 4 are accepted.
+        /// </summary>
+        [Test]
+        public void TestPutCharacterCategory()
+        {
+            UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
+
+            // Expected to throw - presumably because "DUMMY_NAME" is not a known category.
+            AssertThrows(() => unkDic.PutCharacterCategory(0, "DUMMY_NAME"));
+
+            // Expected to throw - presumably because the id is negative.
+            AssertThrows(() => unkDic.PutCharacterCategory(-1, "KATAKANA"));
+
+            unkDic.PutCharacterCategory(0, "DEFAULT");
+            unkDic.PutCharacterCategory(1, "GREEK");
+            unkDic.PutCharacterCategory(2, "HIRAGANA");
+            unkDic.PutCharacterCategory(3, "KATAKANA");
+            unkDic.PutCharacterCategory(4, "KANJI");
+        }
+
+        /// <summary>
+        /// Checks that <c>Put</c> throws for an entry supplied before any category
+        /// is registered, and accepts three entries once their categories exist.
+        /// </summary>
+        [Test]
+        public void TestPut()
+        {
+            UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
+
+            // Expected to throw. Note this entry also has one numeric column fewer
+            // than the accepted entries below.
+            AssertThrows(() => unkDic.Put(CSVUtil.Parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));
+
+            string entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
+            string entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
+            string entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";
+
+            unkDic.PutCharacterCategory(0, "ALPHA");
+            unkDic.PutCharacterCategory(1, "HIRAGANA");
+            unkDic.PutCharacterCategory(2, "KANJI");
+
+            unkDic.Put(CSVUtil.Parse(entry1));
+            unkDic.Put(CSVUtil.Parse(entry2));
+            unkDic.Put(CSVUtil.Parse(entry3));
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/Util/TestToStringUtil.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/Util/TestToStringUtil.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/Util/TestToStringUtil.cs
new file mode 100644
index 0000000..2922b27
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/Util/TestToStringUtil.cs
@@ -0,0 +1,121 @@
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests for <c>ToStringUtil</c>: part-of-speech tag translation and
+    /// romanization of katakana strings.
+    /// </summary>
+    /// <remarks>
+    /// The Japanese literals are written as real Unicode text. Mis-decoded
+    /// (mojibake) literals collapse distinct katakana into identical strings,
+    /// which makes the dictionary initializer in <see cref="TestHepburnTable"/>
+    /// throw on duplicate keys.
+    /// </remarks>
+    public class TestToStringUtil : LuceneTestCase
+    {
+        /// <summary>Checks the translation of one part-of-speech tag.</summary>
+        [Test]
+        public void TestPOS()
+        {
+            assertEquals("noun-suffix-verbal", ToStringUtil.GetPOSTranslation("名詞-接尾-サ変接続"));
+        }
+
+        /// <summary>Checks the romanization of five whole katakana words.</summary>
+        [Test]
+        public void TestHepburn()
+        {
+            assertEquals("majan", ToStringUtil.GetRomanization("マージャン"));
+            assertEquals("uroncha", ToStringUtil.GetRomanization("ウーロンチャ"));
+            assertEquals("chahan", ToStringUtil.GetRomanization("チャーハン"));
+            assertEquals("chashu", ToStringUtil.GetRomanization("チャーシュー"));
+            assertEquals("shumai", ToStringUtil.GetRomanization("シューマイ"));
+        }
+
+        /// <summary>
+        /// Romanizes every key of a katakana-to-romaji table and compares the
+        /// result with the table value, using the key as the failure message.
+        /// </summary>
+        // see http://en.wikipedia.org/wiki/Hepburn_romanization,
+        // but this isn't even thorough or really probably what we want!
+        [Test]
+        public void TestHepburnTable()
+        {
+            IDictionary<string, string> table = new Dictionary<string, string>()
+            {
+                { "ア", "a" }, { "イ", "i" }, { "ウ", "u" }, { "エ", "e" }, { "オ", "o" },
+                { "カ", "ka" }, { "キ", "ki" }, { "ク", "ku" }, { "ケ", "ke" }, { "コ", "ko" },
+                { "サ", "sa" }, { "シ", "shi" }, { "ス", "su" }, { "セ", "se" }, { "ソ", "so" },
+                { "タ", "ta" }, { "チ", "chi" }, { "ツ", "tsu" }, { "テ", "te" }, { "ト", "to" },
+                { "ナ", "na" }, { "ニ", "ni" }, { "ヌ", "nu" }, { "ネ", "ne" }, { "ノ", "no" },
+                { "ハ", "ha" }, { "ヒ", "hi" }, { "フ", "fu" }, { "ヘ", "he" }, { "ホ", "ho" },
+                { "マ", "ma" }, { "ミ", "mi" }, { "ム", "mu" }, { "メ", "me" }, { "モ", "mo" },
+                { "ヤ", "ya" }, { "ユ", "yu" }, { "ヨ", "yo" },
+                { "ラ", "ra" }, { "リ", "ri" }, { "ル", "ru" }, { "レ", "re" }, { "ロ", "ro" },
+                { "ワ", "wa" }, { "ヰ", "i" }, { "ヱ", "e" }, { "ヲ", "o" },
+                { "ン", "n" },
+                { "ガ", "ga" }, { "ギ", "gi" }, { "グ", "gu" }, { "ゲ", "ge" }, { "ゴ", "go" },
+                { "ザ", "za" }, { "ジ", "ji" }, { "ズ", "zu" }, { "ゼ", "ze" }, { "ゾ", "zo" },
+                { "ダ", "da" }, { "ヂ", "ji" }, { "ヅ", "zu" }, { "デ", "de" }, { "ド", "do" },
+                { "バ", "ba" }, { "ビ", "bi" }, { "ブ", "bu" }, { "ベ", "be" }, { "ボ", "bo" },
+                { "パ", "pa" }, { "ピ", "pi" }, { "プ", "pu" }, { "ペ", "pe" }, { "ポ", "po" },
+
+                { "キャ", "kya" }, { "キュ", "kyu" }, { "キョ", "kyo" },
+                { "シャ", "sha" }, { "シュ", "shu" }, { "ショ", "sho" },
+                { "チャ", "cha" }, { "チュ", "chu" }, { "チョ", "cho" },
+                { "ニャ", "nya" }, { "ニュ", "nyu" }, { "ニョ", "nyo" },
+                { "ヒャ", "hya" }, { "ヒュ", "hyu" }, { "ヒョ", "hyo" },
+                { "ミャ", "mya" }, { "ミュ", "myu" }, { "ミョ", "myo" },
+                { "リャ", "rya" }, { "リュ", "ryu" }, { "リョ", "ryo" },
+                { "ギャ", "gya" }, { "ギュ", "gyu" }, { "ギョ", "gyo" },
+                { "ジャ", "ja" }, { "ジュ", "ju" }, { "ジョ", "jo" },
+                { "ヂャ", "ja" }, { "ヂュ", "ju" }, { "ヂョ", "jo" },
+                { "ビャ", "bya" }, { "ビュ", "byu" }, { "ビョ", "byo" },
+                { "ピャ", "pya" }, { "ピュ", "pyu" }, { "ピョ", "pyo" },
+
+                { "イィ", "yi" }, { "イェ", "ye" },
+                { "ウァ", "wa" }, { "ウィ", "wi" }, { "ウゥ", "wu" }, { "ウェ", "we" }, { "ウォ", "wo" },
+                { "ウュ", "wyu" },
+                // TODO: really should be vu
+                { "ヴァ", "va" }, { "ヴィ", "vi" }, { "ヴ", "v" }, { "ヴェ", "ve" }, { "ヴォ", "vo" },
+                { "ヴャ", "vya" }, { "ヴュ", "vyu" }, { "ヴィェ", "vye" }, { "ヴョ", "vyo" },
+                { "キェ", "kye" },
+                { "ギェ", "gye" },
+                { "クァ", "kwa" }, { "クィ", "kwi" }, { "クェ", "kwe" }, { "クォ", "kwo" },
+                { "クヮ", "kwa" },
+                { "グァ", "gwa" }, { "グィ", "gwi" }, { "グェ", "gwe" }, { "グォ", "gwo" },
+                { "グヮ", "gwa" },
+                { "シェ", "she" },
+                { "ジェ", "je" },
+                { "スィ", "si" },
+                { "ズィ", "zi" },
+                { "チェ", "che" },
+                { "ツァ", "tsa" }, { "ツィ", "tsi" }, { "ツェ", "tse" }, { "ツォ", "tso" },
+                { "ツュ", "tsyu" },
+                { "ティ", "ti" }, { "トゥ", "tu" },
+                { "テュ", "tyu" },
+                { "ディ", "di" }, { "ドゥ", "du" },
+                { "デュ", "dyu" },
+                { "ニェ", "nye" },
+                { "ヒェ", "hye" },
+                { "ビェ", "bye" },
+                { "ピェ", "pye" },
+                { "ファ", "fa" }, { "フィ", "fi" }, { "フェ", "fe" }, { "フォ", "fo" },
+                { "フャ", "fya" }, { "フュ", "fyu" }, { "フィェ", "fye" }, { "フョ", "fyo" },
+                { "ホゥ", "hu" },
+                { "ミェ", "mye" },
+                { "リェ", "rye" },
+                // NOTE(review): the second character of these five keys is assumed to be
+                // U+309C (spacing semi-voiced sound mark) rather than the combining
+                // U+309A - confirm against ToStringUtil.GetRomanization.
+                { "ラ\u309C", "la" }, { "リ\u309C", "li" }, { "ル\u309C", "lu" }, { "レ\u309C", "le" }, { "ロ\u309C", "lo" },
+                { "ヷ", "va" }, { "ヸ", "vi" }, { "ヹ", "ve" }, { "ヺ", "vo" },
+            };
+
+            // Enumerate pairs directly rather than looking each key up again.
+            foreach (KeyValuePair<string, string> entry in table)
+            {
+                assertEquals(entry.Key, entry.Value, ToStringUtil.GetRomanization(entry.Key));
+            }
+        }
+    }
+}

[03/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

Reply via email to