[07/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

nightowl888 Sun, 23 Jul 2017 10:36:50 -0700

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs
new file mode 100644
index 0000000..95e2703
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs
@@ -0,0 +1,1401 @@
+ï»¿using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Utility class for English translations of morphological data,
+    /// used only for debugging.
+    /// </summary>
+    public static class ToStringUtil
+    {
+        // Translation map from Japanese part-of-speech tags to their English names;
+        // only used for reflectWith. Keys are compared ordinally.
+        //
+        // Every key must be distinct: the collection initializer calls Add(), which
+        // throws on a duplicate key and would make the whole type fail to initialize.
+        // The keys below are therefore written as real Japanese text (the file must be
+        // stored as UTF-8); a mis-encoded copy collapses e.g. "名詞" and "動詞" into
+        // the same literal.
+        private static readonly IDictionary<string, string> posTranslations = new Dictionary<string, string>(StringComparer.Ordinal)
+        {
+            { "名詞", "noun" },
+            { "名詞-一般", "noun-common" },
+            { "名詞-固有名詞", "noun-proper" },
+            { "名詞-固有名詞-一般", "noun-proper-misc" },
+            { "名詞-固有名詞-人名", "noun-proper-person" },
+            { "名詞-固有名詞-人名-一般", "noun-proper-person-misc" },
+            { "名詞-固有名詞-人名-姓", "noun-proper-person-surname" },
+            { "名詞-固有名詞-人名-名", "noun-proper-person-given_name" },
+            { "名詞-固有名詞-組織", "noun-proper-organization" },
+            { "名詞-固有名詞-地域", "noun-proper-place" },
+            { "名詞-固有名詞-地域-一般", "noun-proper-place-misc" },
+            { "名詞-固有名詞-地域-国", "noun-proper-place-country" },
+            { "名詞-代名詞", "noun-pronoun" },
+            { "名詞-代名詞-一般", "noun-pronoun-misc" },
+            { "名詞-代名詞-縮約", "noun-pronoun-contraction" },
+            { "名詞-副詞可能", "noun-adverbial" },
+            { "名詞-サ変接続", "noun-verbal" },
+            { "名詞-形容動詞語幹", "noun-adjective-base" },
+            { "名詞-数", "noun-numeric" },
+            { "名詞-非自立", "noun-affix" },
+            { "名詞-非自立-一般", "noun-affix-misc" },
+            { "名詞-非自立-副詞可能", "noun-affix-adverbial" },
+            { "名詞-非自立-助動詞語幹", "noun-affix-aux" },
+            { "名詞-非自立-形容動詞語幹", "noun-affix-adjective-base" },
+            { "名詞-特殊", "noun-special" },
+            { "名詞-特殊-助動詞語幹", "noun-special-aux" },
+            { "名詞-接尾", "noun-suffix" },
+            { "名詞-接尾-一般", "noun-suffix-misc" },
+            { "名詞-接尾-人名", "noun-suffix-person" },
+            { "名詞-接尾-地域", "noun-suffix-place" },
+            { "名詞-接尾-サ変接続", "noun-suffix-verbal" },
+            { "名詞-接尾-助動詞語幹", "noun-suffix-aux" },
+            { "名詞-接尾-形容動詞語幹", "noun-suffix-adjective-base" },
+            { "名詞-接尾-副詞可能", "noun-suffix-adverbial" },
+            { "名詞-接尾-助数詞", "noun-suffix-classifier" },
+            { "名詞-接尾-特殊", "noun-suffix-special" },
+            { "名詞-接続詞的", "noun-suffix-conjunctive" },
+            { "名詞-動詞非自立的", "noun-verbal_aux" },
+            { "名詞-引用文字列", "noun-quotation" },
+            { "名詞-ナイ形容詞語幹", "noun-nai_adjective" },
+            { "接頭詞", "prefix" },
+            { "接頭詞-名詞接続", "prefix-nominal" },
+            { "接頭詞-動詞接続", "prefix-verbal" },
+            { "接頭詞-形容詞接続", "prefix-adjectival" },
+            { "接頭詞-数接続", "prefix-numerical" },
+            { "動詞", "verb" },
+            { "動詞-自立", "verb-main" },
+            { "動詞-非自立", "verb-auxiliary" },
+            { "動詞-接尾", "verb-suffix" },
+            { "形容詞", "adjective" },
+            { "形容詞-自立", "adjective-main" },
+            { "形容詞-非自立", "adjective-auxiliary" },
+            { "形容詞-接尾", "adjective-suffix" },
+            { "副詞", "adverb" },
+            { "副詞-一般", "adverb-misc" },
+            { "副詞-助詞類接続", "adverb-particle_conjunction" },
+            { "連体詞", "adnominal" },
+            { "接続詞", "conjunction" },
+            { "助詞", "particle" },
+            { "助詞-格助詞", "particle-case" },
+            { "助詞-格助詞-一般", "particle-case-misc" },
+            { "助詞-格助詞-引用", "particle-case-quote" },
+            { "助詞-格助詞-連語", "particle-case-compound" },
+            { "助詞-接続助詞", "particle-conjunctive" },
+            { "助詞-係助詞", "particle-dependency" },
+            { "助詞-副助詞", "particle-adverbial" },
+            { "助詞-間投助詞", "particle-interjective" },
+            { "助詞-並立助詞", "particle-coordinate" },
+            { "助詞-終助詞", "particle-final" },
+            { "助詞-副助詞／並立助詞／終助詞", "particle-adverbial/conjunctive/final" },
+            { "助詞-連体化", "particle-adnominalizer" },
+            // NOTE(review): same English value as the entry above - confirm this is intended.
+            { "助詞-副詞化", "particle-adnominalizer" },
+            { "助詞-特殊", "particle-special" },
+            { "助動詞", "auxiliary-verb" },
+            { "感動詞", "interjection" },
+            { "記号", "symbol" },
+            { "記号-一般", "symbol-misc" },
+            { "記号-句点", "symbol-period" },
+            { "記号-読点", "symbol-comma" },
+            { "記号-空白", "symbol-space" },
+            { "記号-括弧開", "symbol-open_bracket" },
+            { "記号-括弧閉", "symbol-close_bracket" },
+            { "記号-アルファベット", "symbol-alphabetic" },
+            { "その他", "other" },
+            { "その他-間投", "other-interjection" },
+            { "フィラー", "filler" },
+            { "非言語音", "non-verbal" },
+            { "語断片", "fragment" },
+            { "未知語", "unknown" }
+        };
+
+
+        /// <summary>
+        /// Gets the English form of a part-of-speech tag.
+        /// </summary>
+        /// <param name="s">The part-of-speech tag to translate; may be <c>null</c>.</param>
+        /// <returns>
+        /// The English translation, or <c>null</c> when <paramref name="s"/> is
+        /// <c>null</c> or has no entry in the translation map.
+        /// </returns>
+        public static string GetPOSTranslation(string s)
+        {
+            // TryGetValue throws ArgumentNullException for a null key; treat null
+            // like any other tag without a translation.
+            if (s == null)
+            {
+                return null;
+            }
+
+            string result;
+            return posTranslations.TryGetValue(s, out result) ? result : null;
+        }
+
+        // Translation map from Japanese inflection-type names to their English names;
+        // only used for reflectWith. Keys are compared ordinally.
+        //
+        // Every key must be distinct: the collection initializer calls Add(), which
+        // throws on a duplicate key and would make the whole type fail to initialize.
+        // The keys are therefore written as real Japanese text (the file must be stored
+        // as UTF-8); a mis-encoded copy collapses e.g. the マ行 / バ行 / ナ行 keys into
+        // the same literal.
+        private static readonly IDictionary<string, string> inflTypeTranslations = new Dictionary<string, string>(StringComparer.Ordinal)
+        {
+            { "*", "*" },
+            { "形容詞・アウオ段", "adj-group-a-o-u" },
+            { "形容詞・イ段", "adj-group-i" },
+            { "形容詞・イイ", "adj-group-ii" },
+            { "不変化型", "non-inflectional" },
+            // NOTE(review): タ -> "special-da" and ダ -> "special-ta" look swapped;
+            // kept exactly as found - confirm against the upstream source before changing.
+            { "特殊・タ", "special-da" },
+            { "特殊・ダ", "special-ta" },
+            { "文語・ゴトシ", "classical-gotoshi" },
+            { "特殊・ジャ", "special-ja" },
+            { "特殊・ナイ", "special-nai" },
+            { "五段・ラ行特殊", "5-row-cons-r-special" },
+            { "特殊・ヌ", "special-nu" },
+            { "文語・キ", "classical-ki" },
+            { "特殊・タイ", "special-tai" },
+            { "文語・ベシ", "classical-beshi" },
+            { "特殊・ヤ", "special-ya" },
+            { "文語・マジ", "classical-maji" },
+            { "下二・タ行", "2-row-lower-cons-t" },
+            { "特殊・デス", "special-desu" },
+            { "特殊・マス", "special-masu" },
+            { "五段・ラ行アル", "5-row-aru" },
+            { "文語・ナリ", "classical-nari" },
+            { "文語・リ", "classical-ri" },
+            { "文語・ケリ", "classical-keri" },
+            { "文語・ル", "classical-ru" },
+            { "五段・カ行イ音便", "5-row-cons-k-i-onbin" },
+            { "五段・サ行", "5-row-cons-s" },
+            { "一段", "1-row" },
+            { "五段・ワ行促音便", "5-row-cons-w-cons-onbin" },
+            { "五段・マ行", "5-row-cons-m" },
+            { "五段・タ行", "5-row-cons-t" },
+            { "五段・ラ行", "5-row-cons-r" },
+            // NOTE(review): the dash in the two "サ変・−…" keys is assumed to be
+            // U+2212 MINUS SIGN - verify against the dictionary data.
+            { "サ変・−スル", "irregular-suffix-suru" },
+            { "五段・ガ行", "5-row-cons-g" },
+            { "サ変・−ズル", "irregular-suffix-zuru" },
+            { "五段・バ行", "5-row-cons-b" },
+            { "五段・ワ行ウ音便", "5-row-cons-w-u-onbin" },
+            { "下二・ダ行", "2-row-lower-cons-d" },
+            { "五段・カ行促音便ユク", "5-row-cons-k-cons-onbin-yuku" },
+            { "上二・ダ行", "2-row-upper-cons-d" },
+            { "五段・カ行促音便", "5-row-cons-k-cons-onbin" },
+            { "一段・得ル", "1-row-eru" },
+            { "四段・タ行", "4-row-cons-t" },
+            { "五段・ナ行", "5-row-cons-n" },
+            { "下二・ハ行", "2-row-lower-cons-h" },
+            { "四段・ハ行", "4-row-cons-h" },
+            { "四段・バ行", "4-row-cons-b" },
+            { "サ変・スル", "irregular-suru" },
+            { "上二・ハ行", "2-row-upper-cons-h" },
+            { "下二・マ行", "2-row-lower-cons-m" },
+            { "四段・サ行", "4-row-cons-s" },
+            { "下二・ガ行", "2-row-lower-cons-g" },
+            { "カ変・来ル", "kuru-kanji" },
+            { "一段・クレル", "1-row-kureru" },
+            { "下二・得", "2-row-lower-u" },
+            { "カ変・クル", "kuru-kana" },
+            { "ラ変", "irregular-cons-r" },
+            { "下二・カ行", "2-row-lower-cons-k" },
+        };
+
+
+        /// <summary>
+        /// Gets the English form of an inflection type.
+        /// </summary>
+        /// <param name="s">The inflection type to translate; may be <c>null</c>.</param>
+        /// <returns>
+        /// The English translation, or <c>null</c> when <paramref name="s"/> is
+        /// <c>null</c> or has no entry in the translation map.
+        /// </returns>
+        public static string GetInflectionTypeTranslation(string s)
+        {
+            // TryGetValue throws ArgumentNullException for a null key; treat null
+            // like any other inflection type without a translation.
+            if (s == null)
+            {
+                return null;
+            }
+
+            string result;
+            return inflTypeTranslations.TryGetValue(s, out result) ? result : null;
+        }
+
+        // Translation map from Japanese inflection-form names to their English names;
+        // only used for reflectWith. Keys are compared ordinally.
+        //
+        // Every key must be distinct: the collection initializer calls Add(), which
+        // throws on a duplicate key and would make the whole type fail to initialize.
+        // The keys are therefore written as real Japanese text (the file must be stored
+        // as UTF-8); a mis-encoded copy collapses e.g. the 連用テ接続 / 連用デ接続 /
+        // 連用ニ接続 keys into the same literal.
+        private static readonly IDictionary<string, string> inflFormTranslations = new Dictionary<string, string>(StringComparer.Ordinal)
+        {
+            { "*", "*" },
+            { "基本形", "base" },
+            { "文語基本形", "classical-base" },
+            { "未然ヌ接続", "imperfective-nu-connection" },
+            { "未然ウ接続", "imperfective-u-connection" },
+            { "連用タ接続", "conjunctive-ta-connection" },
+            { "連用テ接続", "conjunctive-te-connection" },
+            { "連用ゴザイ接続", "conjunctive-gozai-connection" },
+            { "体言接続", "uninflected-connection" },
+            { "仮定形", "subjunctive" },
+            { "命令ｅ", "imperative-e" },
+            { "仮定縮約１", "conditional-contracted-1" },
+            { "仮定縮約２", "conditional-contracted-2" },
+            { "ガル接続", "garu-connection" },
+            { "未然形", "imperfective" },
+            { "連用形", "conjunctive" },
+            { "音便基本形", "onbin-base" },
+            { "連用デ接続", "conjunctive-de-connection" },
+            { "未然特殊", "imperfective-special" },
+            { "命令ｉ", "imperative-i" },
+            { "連用ニ接続", "conjunctive-ni-connection" },
+            { "命令ｙｏ", "imperative-yo" },
+            { "体言接続特殊", "adnominal-special" },
+            { "命令ｒｏ", "imperative-ro" },
+            { "体言接続特殊２", "uninflected-special-connection-2" },
+            { "未然レル接続", "imperfective-reru-connection" },
+            { "現代基本形", "modern-base" },
+            { "基本形-促音便", "base-onbin" }, // not sure about this
+        };
+
+
+        /// <summary>
+        /// Gets the English form of an inflected form.
+        /// </summary>
+        /// <param name="s">The inflection form to translate; may be <c>null</c>.</param>
+        /// <returns>
+        /// The English translation, or <c>null</c> when <paramref name="s"/> is
+        /// <c>null</c> or has no entry in the translation map.
+        /// </returns>
+        public static string GetInflectedFormTranslation(string s)
+        {
+            // TryGetValue throws ArgumentNullException for a null key; treat null
+            // like any other inflection form without a translation.
+            if (s == null)
+            {
+                return null;
+            }
+
+            string result;
+            return inflFormTranslations.TryGetValue(s, out result) ? result : null;
+        }
+
+        /// <summary>
+        /// Romanizes katakana with modified Hepburn and returns the result as a new string.
+        /// </summary>
+        /// <param name="s">The katakana text to romanize.</param>
+        /// <returns>The text appended by the <see cref="StringBuilder"/>-based overload.</returns>
+        /// <exception cref="Exception">
+        /// Thrown, wrapping the original, if the underlying overload raises an <see cref="IOException"/>.
+        /// </exception>
+        public static string GetRomanization(string s)
+        {
+            StringBuilder romanized = new StringBuilder();
+            try
+            {
+                GetRomanization(romanized, s);
+                return romanized.ToString();
+            }
+            catch (IOException ioFailure)
+            {
+                // Re-wrap as a general exception, keeping the original as InnerException.
+                throw new Exception(ioFailure.ToString(), ioFailure);
+            }
+        }
+
+        /// <summary>
+        /// Romanize katakana with modified hepburn
+        /// </summary>
+        // TODO: now that this is used by readingsfilter and not just for
+        // debugging, fix this to really be a scheme that works best with IMEs
+        public static void GetRomanization(StringBuilder builder, string s)
+        {
+            int len = s.Length;
+            for (int i = 0; i < len; i++)
+            {
+                // maximum lookahead: 3
+                char ch = s[i];
+                char ch2 = (i < len - 1) ? s[i + 1] : (char)0;
+                char ch3 = (i < len - 2) ? s[i + 2] : (char)0;
+
+                //main:
+                switch (ch)
+                {
+
+                    case 'ã':
+                        switch (ch2)
+                        {
+                            case 'ã«':
+                            case 'ã':
+                            case 'ã¯':
+                            case 'ã±':
+                            case 'ã³':
+                                builder.Append('k');
+                                goto break_main;
+                            case 'ãµ':
+                            case 'ã·':
+                            case 'ã¹':
+                            case 'ã»':
+                            case 'ã½':
+                                builder.Append('s');
+                                goto break_main;
+                            case 'ã¿':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                                builder.Append('t');
+                                goto break_main;
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                                builder.Append('p');
+                                goto break_main;
+                        }
+                        break;
+                    case 'ã¢':
+                        builder.Append('a');
+                        break;
+                    case 'ã¤':
+                        if (ch2 == 'ã£')
+                        {
+                            builder.Append("yi");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("ye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append('i');
+                        }
+                        break;
+                    case 'ã¦':
+                        switch (ch2)
+                        {
+                            case 'ã¡':
+                                builder.Append("wa");
+                                i++;
+                                break;
+                            case 'ã£':
+                                builder.Append("wi");
+                                i++;
+                                break;
+                            case 'ã¥':
+                                builder.Append("wu");
+                                i++;
+                                break;
+                            case 'ã§':
+                                builder.Append("we");
+                                i++;
+                                break;
+                            case 'ã©':
+                                builder.Append("wo");
+                                i++;
+                                break;
+                            case 'ã¥':
+                                builder.Append("wyu");
+                                i++;
+                                break;
+                            default:
+                                builder.Append('u');
+                                break;
+                        }
+                        break;
+                    case 'ã¨':
+                        builder.Append('e');
+                        break;
+                    case 'ãª':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append('Å');
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append('o');
+                        }
+                        break;
+                    case 'ã«':
+                        builder.Append("ka");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("kyÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("kyÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("kya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("kyo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("kyu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("kye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ki");
+                        }
+                        break;
+                    case 'ã¯':
+                        switch (ch2)
+                        {
+                            case 'ã¡':
+                                builder.Append("kwa");
+                                i++;
+                                break;
+                            case 'ã£':
+                                builder.Append("kwi");
+                                i++;
+                                break;
+                            case 'ã§':
+                                builder.Append("kwe");
+                                i++;
+                                break;
+                            case 'ã©':
+                                builder.Append("kwo");
+                                i++;
+                                break;
+                            case 'ã®':
+                                builder.Append("kwa");
+                                i++;
+                                break;
+                            default:
+                                builder.Append("ku");
+                                break;
+                        }
+                        break;
+                    case 'ã±':
+                        builder.Append("ke");
+                        break;
+                    case 'ã³':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("kÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ko");
+                        }
+                        break;
+                    case 'ãµ':
+                        builder.Append("sa");
+                        break;
+                    case 'ã·':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("shÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("shÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("sha");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("sho");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("shu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("she");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("shi");
+                        }
+                        break;
+                    case 'ã¹':
+                        if (ch2 == 'ã£')
+                        {
+                            builder.Append("si");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("su");
+                        }
+                        break;
+                    case 'ã»':
+                        builder.Append("se");
+                        break;
+                    case 'ã½':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("sÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("so");
+                        }
+                        break;
+                    case 'ã¿':
+                        builder.Append("ta");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("chÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("chÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("cha");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("cho");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("chu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("che");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("chi");
+                        }
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¡')
+                        {
+                            builder.Append("tsa");
+                            i++;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("tsi");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("tse");
+                            i++;
+                        }
+                        else if (ch2 == 'ã©')
+                        {
+                            builder.Append("tso");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("tsyu");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("tsu");
+                        }
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã£')
+                        {
+                            builder.Append("ti");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("tu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("tyu");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("te");
+                        }
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("tÅ");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("tu");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("to");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("na");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("nyÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("nyÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("nya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("nyo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("nyu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("nye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ni");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("nu");
+                        break;
+                    case 'ã':
+                        builder.Append("ne");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("nÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("no");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("ha");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("hyÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("hyÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("hya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("hyo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("hyu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("hye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("hi");
+                        }
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã£')
+                        {
+                            builder.Append("fya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("fyu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã£' && ch3 == 'ã§')
+                        {
+                            builder.Append("fye");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("fyo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¡')
+                        {
+                            builder.Append("fa");
+                            i++;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("fi");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("fe");
+                            i++;
+                        }
+                        else if (ch2 == 'ã©')
+                        {
+                            builder.Append("fo");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("fu");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("he");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("hÅ");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("hu");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ho");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("ma");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("myÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("myÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("mya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("myo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("myu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("mye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("mi");
+                        }
+                        break;
+                    case 'ã ':
+                        builder.Append("mu");
+                        break;
+                    case 'ã¡':
+                        builder.Append("me");
+                        break;
+                    case 'ã¢':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("mÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("mo");
+                        }
+                        break;
+                    case 'ã¤':
+                        builder.Append("ya");
+                        break;
+                    case 'ã¦':
+                        builder.Append("yu");
+                        break;
+                    case 'ã¨':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("yÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("yo");
+                        }
+                        break;
+                    case 'ã©':
+                        if (ch2 == 'ã')
+                        {
+                            builder.Append("la");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ra");
+                        }
+                        break;
+                    case 'ãª':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("ryÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("ryÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("rya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("ryo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("ryu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("rye");
+                            i++;
+                        }
+                        else if (ch2 == 'ã')
+                        {
+                            builder.Append("li");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ri");
+                        }
+                        break;
+                    case 'ã«':
+                        if (ch2 == 'ã')
+                        {
+                            builder.Append("lu");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ru");
+                        }
+                        break;
+                    case 'ã¬':
+                        if (ch2 == 'ã')
+                        {
+                            builder.Append("le");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("re");
+                        }
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("rÅ");
+                            i++;
+                        }
+                        else if (ch2 == 'ã')
+                        {
+                            builder.Append("lo");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ro");
+                        }
+                        break;
+                    case 'ã¯':
+                        builder.Append("wa");
+                        break;
+                    case 'ã°':
+                        builder.Append("i");
+                        break;
+                    case 'ã±':
+                        builder.Append("e");
+                        break;
+                    case 'ã²':
+                        builder.Append("o");
+                        break;
+                    case 'ã³':
+                        switch (ch2)
+                        {
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã':
+                            case 'ã ':
+                            case 'ã¡':
+                            case 'ã¢':
+                                builder.Append('m');
+                                goto break_main;
+                            case 'ã¤':
+                            case 'ã¦':
+                            case 'ã¨':
+                            case 'ã¢':
+                            case 'ã¤':
+                            case 'ã¦':
+                            case 'ã¨':
+                            case 'ãª':
+                                builder.Append("n'");
+                                goto break_main;
+                            default:
+                                builder.Append("n");
+                                goto break_main;
+                        }
+                    case 'ã¬':
+                        builder.Append("ga");
+                        break;
+                    case 'ã®':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("gyÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("gyÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("gya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("gyo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("gyu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("gye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("gi");
+                        }
+                        break;
+                    case 'ã°':
+                        switch (ch2)
+                        {
+                            case 'ã¡':
+                                builder.Append("gwa");
+                                i++;
+                                break;
+                            case 'ã£':
+                                builder.Append("gwi");
+                                i++;
+                                break;
+                            case 'ã§':
+                                builder.Append("gwe");
+                                i++;
+                                break;
+                            case 'ã©':
+                                builder.Append("gwo");
+                                i++;
+                                break;
+                            case 'ã®':
+                                builder.Append("gwa");
+                                i++;
+                                break;
+                            default:
+                                builder.Append("gu");
+                                break;
+                        }
+                        break;
+                    case 'ã²':
+                        builder.Append("ge");
+                        break;
+                    case 'ã´':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("gÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("go");
+                        }
+                        break;
+                    case 'ã¶':
+                        builder.Append("za");
+                        break;
+                    case 'ã¸':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("jÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("jÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("ja");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("jo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("ju");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("je");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ji");
+                        }
+                        break;
+                    case 'ãº':
+                        if (ch2 == 'ã£')
+                        {
+                            builder.Append("zi");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("zu");
+                        }
+                        break;
+                    case 'ã¼':
+                        builder.Append("ze");
+                        break;
+                    case 'ã¾':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("zÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("zo");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("da");
+                        break;
+                    case 'ã':
+                        // TODO: investigate all this
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("jÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("jÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("ja");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("jo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("ju");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("je");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("ji");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("zu");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã£')
+                        {
+                            builder.Append("di");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("dyu");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("de");
+                        }
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("dÅ");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("du");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("do");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("ba");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("byÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("byÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("bya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("byo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("byu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("bye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("bi");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("bu");
+                        break;
+                    case 'ã':
+                        builder.Append("be");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("bÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("bo");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("pa");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã§' && ch3 == 'ã¦')
+                        {
+                            builder.Append("pyÅ");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã¥' && ch3 == 'ã¦')
+                        {
+                            builder.Append("pyÅ«");
+                            i += 2;
+                        }
+                        else if (ch2 == 'ã£')
+                        {
+                            builder.Append("pya");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("pyo");
+                            i++;
+                        }
+                        else if (ch2 == 'ã¥')
+                        {
+                            builder.Append("pyu");
+                            i++;
+                        }
+                        else if (ch2 == 'ã§')
+                        {
+                            builder.Append("pye");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("pi");
+                        }
+                        break;
+                    case 'ã':
+                        builder.Append("pu");
+                        break;
+                    case 'ã':
+                        builder.Append("pe");
+                        break;
+                    case 'ã':
+                        if (ch2 == 'ã¦')
+                        {
+                            builder.Append("pÅ");
+                            i++;
+                        }
+                        else
+                        {
+                            builder.Append("po");
+                        }
+                        break;
+                    case 'ã·':
+                        builder.Append("va");
+                        break;
+                    case 'ã¸':
+                        builder.Append("vi");
+                        break;
+                    case 'ã¹':
+                        builder.Append("ve");
+                        break;
+                    case 'ãº':
+                        builder.Append("vo");
+                        break;
+                    case 'ã´':
+                        if (ch2 == 'ã£' && ch3 == 'ã§')
+                        {
+                            builder.Append("vye");
+                            i += 2;
+                        }
+                        else
+                        {
+                            builder.Append('v');
+                        }
+                        break;
+                    case 'ã¡':
+                        builder.Append('a');
+                        break;
+                    case 'ã£':
+                        builder.Append('i');
+                        break;
+                    case 'ã¥':
+                        builder.Append('u');
+                        break;
+                    case 'ã§':
+                        builder.Append('e');
+                        break;
+                    case 'ã©':
+                        builder.Append('o');
+                        break;
+                    case 'ã®':
+                        builder.Append("wa");
+                        break;
+                    case 'ã£':
+                        builder.Append("ya");
+                        break;
+                    case 'ã¥':
+                        builder.Append("yu");
+                        break;
+                    case 'ã§':
+                        builder.Append("yo");
+                        break;
+                    case 'ã¼':
+                        break;
+                    default:
+                        builder.Append(ch);
+                        break;
+                }
+                break_main: { }
+            }
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/project.json 
b/src/Lucene.Net.Analysis.Kuromoji/project.json
new file mode 100644
index 0000000..937b9bf
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/project.json
@@ -0,0 +1,60 @@
+{
+  "version": "4.8.0",
+  "title": "Lucene.Net.Analysis.Kuromoji",
+  "description": "Japanese Morphological Analyzer for the Lucene.Net full-text search engine library from The Apache Software Foundation.",
+  "authors": [ "The Apache Software Foundation" ],
+  "packOptions": {
+    "projectUrl": "http://lucenenet.apache.org/",
+    "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt",
+    "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true",
+    "owners": [ "The Apache Software Foundation" ],
+    "repository": { "url": "https://github.com/apache/lucenenet" },
+    "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query", "japanese" ],
+    "releaseNotes": "This package depends on a temporary version of icu.net hosted on MyGet until official .NET Core support is added. To install, copy the NuGet.config file from https://github.com/apache/lucenenet/blob/master/NuGet.config into your project and then install this package via Package Manager Console as usual."
+  },
+  "buildOptions": {
+    "compile": {
+      "includeFiles": [ "../CommonAssemblyInfo.cs" ]
+    },
+    "embed": {
+      "includeFiles": [
+        "stoptags.txt",
+        "stopwords.txt",
+        "Dict/CharacterDefinition.dat",
+        "Dict/ConnectionCosts.dat",
+        "Dict/TokenInfoDictionary$buffer.dat",
+        "Dict/TokenInfoDictionary$fst.dat",
+        "Dict/TokenInfoDictionary$posDict.dat",
+        "Dict/TokenInfoDictionary$targetMap.dat",
+        "Dict/UnknownDictionary$buffer.dat",
+        "Dict/UnknownDictionary$posDict.dat",
+        "Dict/UnknownDictionary$targetMap.dat"
+      ]
+    },
+    "nowarn": [ "1591", "1573" ]
+  },
+  "dependencies": {
+    "Lucene.Net": "4.8.0",
+    "Lucene.Net.Analysis.Common": "4.8.0"
+  },
+  "frameworks": {
+    "netstandard1.5": {
+      "imports": "dnxcore50",
+      "buildOptions": {
+        "debugType": "portable",
+        "define": [ "NETSTANDARD" ]
+      },
+      "dependencies": {
+        "NETStandard.Library": "1.6.0",
+        "System.Globalization.Extensions": "4.3.0",
+        "System.Text.Encoding.CodePages": "4.4.0-preview1-25305-02"
+      }
+    },
+    "net451": {
+      "buildOptions": {
+        "debugType": "full",
+        "define": [ "FEATURE_SERIALIZABLE" ]
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt 
b/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt
new file mode 100644
index 0000000..71b7508
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt
@@ -0,0 +1,420 @@
+#
+# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
+#
+# Any token with a part-of-speech tag that exactly matches one defined in this
+# file is removed from the token stream.
+#
+# Set your own stoptags by uncommenting the lines below.  Note that comments are
+# not allowed on the same line as a stoptag.  See LUCENE-3745 for frequency lists,
+# etc. that can be useful for building your own stoptag set.
+#
+# The entire possible tagset is provided below for convenience.
+#
+#####
+#  noun: unclassified nouns
+#åè©
+#
+#  noun-common: Common nouns or nouns where the sub-classification is undefined
+#åè©-ä¸è¬
+#
+#  noun-proper: Proper nouns where the sub-classification is undefined 
+#åè©-åºæåè©
+#
+#  noun-proper-misc: miscellaneous proper nouns
+#åè©-åºæåè©-ä¸è¬
+#
+#  noun-proper-person: Personal names where the sub-classification is undefined
+#åè©-åºæåè©-äººå
+#
+#  noun-proper-person-misc: names that cannot be divided into surname and 
+#  given name; foreign names; names where the surname or given name is unknown.
+#  e.g. ãå¸ã®æ¹
+#åè©-åºæåè©-äººå-ä¸è¬
+#
+#  noun-proper-person-surname: Mainly Japanese surnames.
+#  e.g. å±±ç°
+#åè©-åºæåè©-äººå-å§
+#
+#  noun-proper-person-given_name: Mainly Japanese given names.
+#  e.g. å¤ªé
+#åè©-åºæåè©-äººå-å
+#
+#  noun-proper-organization: Names representing organizations.
+#  e.g. éç£ç, NHK
+#åè©-åºæåè©-çµç¹
+#
+#  noun-proper-place: Place names where the sub-classification is undefined
+#åè©-åºæåè©-å°å
+#
+#  noun-proper-place-misc: Place names excluding countries.
+#  e.g. ã¢ã¸ã¢, ãã«ã»ãã, äº¬é½
+#åè©-åºæåè©-å°å-ä¸è¬
+#
+#  noun-proper-place-country: Country names. 
+#  e.g. æ¥æ¬, ãªã¼ã¹ãã©ãªã¢
+#åè©-åºæåè©-å°å-å½
+#
+#  noun-pronoun: Pronouns where the sub-classification is undefined
+#åè©-ä»£åè©
+#
+#  noun-pronoun-misc: miscellaneous pronouns: 
+#  e.g. ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, 
ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã
+#åè©-ä»£åè©-ä¸è¬
+#
+#  noun-pronoun-contraction: Spoken language contraction made by combining a 
+#  pronoun and the particle 'wa'.
+#  e.g. ããã, ããã, ãããã, ããã, ãããã 
+#åè©-ä»£åè©-ç¸®ç´
+#
+#  noun-adverbial: Temporal nouns such as names of days or months that behave 
+#  like adverbs. Nouns that represent amount or ratios and can be used 
adverbially,
+#  e.g. éæ, ä¸æ, åå¾, å°é
+#åè©-å¯è©å¯è½
+#
+#  noun-verbal: Nouns that take arguments with case and can appear followed by 
+#  'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã)
+#  e.g. ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã
+#åè©-ãµå¤æ¥ç¶
+#
+#  noun-adjective-base: The base form of adjectives, words that appear before 
ãª ("na")
+#  e.g. å¥åº·, å®æ, é§ç®, ã ã
+#åè©-å½¢å®¹åè©èªå¹¹
+#
+#  noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ 
(å), æ°.
+#  e.g. 0, 1, 2, ä½, æ°, å¹¾
+#åè©-æ°
+#
+#  noun-affix: noun affixes where the sub-classification is undefined
+#åè©-éèªç«
+#
+#  noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words 
that 
+#  attach to the base form of inflectional words, words that cannot be 
classified 
+#  into any of the other categories below. This category includes indefinite 
nouns.
+#  e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, 
ç, ãã¨, äº, ãã¨, æ¯, ãã ã, æ¬¡ç¬¬, 
+#       é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, 
ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿, 
+#       æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è, 
ãã, æ, ããã, æä»¥, ãã, è¨³,
+#       ãã, å²ã, å², ã-å£èª/, ãã-å£èª/
+#åè©-éèªç«-ä¸è¬
+#
+#  noun-affix-adverbial: noun affixes that can behave as adverbs.
+#  e.g. ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, ä»¥å¤, 
ä»¥é, ä»¥å¾, ä»¥ä¸, ä»¥å, ä¸æ¹, ãã, 
+#       ä¸, ãã¡, å, ãã, æã, ããã, éã, ãã, 
ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã, 
+#       æä¸, ããã, èªä½, ãã³, åº¦, ãã, çº, ã¤ã©, é½åº¦, 
ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ, 
+#       ã¨ãã, éç«¯, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, 
ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾, 
+#       å, ä¾, ã¿ãã, ç¢å
+#åè©-éèªç«-å¯è©å¯è½
+#
+#  noun-affix-aux: noun affixes treated as å©åè© ("auxiliary verb") in 
school grammars 
+#  with the stem ãã(ã ) ("you(da)").
+#  e.g.  ãã, ãã, æ§ (ãã)
+#åè©-éèªç«-å©åè©èªå¹¹
+#  
+#  noun-affix-adjective-base: noun affixes that can connect to the indeclinable
+#  connection form ãª (aux "da").
+#  e.g. ã¿ãã, ãµã
+#åè©-éèªç«-å½¢å®¹åè©èªå¹¹
+#
+#  noun-special: special nouns where the sub-classification is undefined.
+#åè©-ç¹æ®
+#
+#  noun-special-aux: The ããã  ("souda") stem form that is used for 
reporting news, is 
+#  treated as å©åè© ("auxiliary verb") in school grammars, and attach to 
the base 
+#  form of inflectional words.
+#  e.g. ãã
+#åè©-ç¹æ®-å©åè©èªå¹¹
+#
+#  noun-suffix: noun suffixes where the sub-classification is undefined.
+#åè©-æ¥å°¾
+#
+#  noun-suffix-misc: Of the nouns or stem forms of other parts of speech that 
connect 
+#  to ã¬ã« or ã¿ã¤ and can combine into compound nouns, words that cannot 
be classified into
+#  any of the other categories below. In general, this category is more 
inclusive than 
+#  æ¥å°¾èª ("suffix") and is usually the last element in a compound noun.
+#  e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, 
ããã¿, (ï½ãã) ã, æ¬¡ç¬¬, æ¸ (ã) ã¿,
+#       ãã, (ã§ã)ã£ã, æ, è¦³, æ§, å¦, é¡, é¢, ç¨
+#åè©-æ¥å°¾-ä¸è¬
+#
+#  noun-suffix-person: Suffixes that form nouns and attach to person names 
more often
+#  than other nouns.
+#  e.g. å, æ§, è
+#åè©-æ¥å°¾-äººå
+#
+#  noun-suffix-place: Suffixes that form nouns and attach to place names more 
often 
+#  than other nouns.
+#  e.g. çº, å¸, ç
+#åè©-æ¥å°¾-å°å
+#
+#  noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, 
those that 
+#  can appear before ã¹ã« ("suru").
+#  e.g. å, è¦, åã, å¥ã, è½ã¡, è²·ã
+#åè©-æ¥å°¾-ãµå¤æ¥ç¶
+#
+#  noun-suffix-aux: The stem form of ããã  (æ§æ) that is used to 
indicate conditions, 
+#  is treated as å©åè© ("auxiliary verb") in school grammars, and attach 
to the 
+#  conjunctive form of inflectional words.
+#  e.g. ãã
+#åè©-æ¥å°¾-å©åè©èªå¹¹
+#
+#  noun-suffix-adjective-base: Suffixes that attach to other nouns or the 
conjunctive 
+#  form of inflectional words and appear before the copula ã  ("da").
+#  e.g. ç, ã, ãã¡
+#åè©-æ¥å°¾-å½¢å®¹åè©èªå¹¹
+#
+#  noun-suffix-adverbial: Suffixes that attach to other nouns and can behave 
as adverbs.
+#  e.g. å¾ (ã), ä»¥å¾, ä»¥é, ä»¥å, åå¾, ä¸, æ«, ä¸, æ (ã)
+#åè©-æ¥å°¾-å¯è©å¯è½
+#
+#  noun-suffix-classifier: Suffixes that attach to numbers and form nouns. 
This category 
+#  is more inclusive than å©æ°è© ("classifier") and includes common nouns 
that attach 
+#  to numbers.
+#  e.g. å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», 
æé, æå
+#åè©-æ¥å°¾-å©æ°è©
+#
+#  noun-suffix-special: Special suffixes that mainly attach to inflecting 
words.
+#  e.g. (æ¥½ã) ã, (èã) æ¹
+#åè©-æ¥å°¾-ç¹æ®
+#
+#  noun-suffix-conjunctive: Nouns that behave like conjunctions and join two 
words 
+#  together.
+#  e.g. (æ¥æ¬) å¯¾ (ã¢ã¡ãªã«), å¯¾ (ã¢ã¡ãªã«), (3) å¯¾ (5), (å¥³åª) 
å¼ (ä¸»å©¦)
+#åè©-æ¥ç¶è©ç
+#
+#  noun-verbal_aux: Nouns that attach to the conjunctive particle ã¦ ("te") 
and are 
+#  semantically verb-like.
+#  e.g. ããã, ãè¦§, å¾¡è¦§, é æ´
+#åè©-åè©éèªç«ç
+#
+#  noun-quotation: text that cannot be segmented into words, proverbs, Chinese 
poetry, 
+#  dialects, English, etc. Currently, the only entry for åè© 
å¼ç¨æåå ("noun quotation") 
+#  is ããã ("iwaku").
+#åè©-å¼ç¨æåå
+#
+#  noun-nai_adjective: Words that appear before the auxiliary verb ãªã 
("nai") and
+#  behave like an adjective.
+#  e.g. ç³ãè¨³, ä»æ¹, ã¨ãã§ã, éã
+#åè©-ãã¤å½¢å®¹è©èªå¹¹
+#
+#####
+#  prefix: unclassified prefixes
+#æ¥é è©
+#
+#  prefix-nominal: Prefixes that attach to nouns (including adjective stem 
forms) 
+#  excluding numerical expressions.
+#  e.g. ã (æ°´), æ (æ°), å (ç¤¾), æ (ï½æ°), é« (åè³ª), ã 
(è¦äº), ã (ç«æ´¾)
+#æ¥é è©-åè©æ¥ç¶
+#
+#  prefix-verbal: Prefixes that attach to the imperative form of a verb or a 
verb
+#  in conjunctive form followed by ãªã/ãªãã/ãã ãã.
+#  e.g. ã (èªã¿ãªãã), ã (åº§ã)
+#æ¥é è©-åè©æ¥ç¶
+#
+#  prefix-adjectival: Prefixes that attach to adjectives.
+#  e.g. ã (å¯ãã§ããã), ãã« (ã§ãã)
+#æ¥é è©-å½¢å®¹è©æ¥ç¶
+#
+#  prefix-numerical: Prefixes that attach to numerical expressions.
+#  e.g. ç´, ããã, æ¯æ
+#æ¥é è©-æ°æ¥ç¶
+#
+#####
+#  verb: unclassified verbs
+#åè©
+#
+#  verb-main:
+#åè©-èªç«
+#
+#  verb-auxiliary:
+#åè©-éèªç«
+#
+#  verb-suffix:
+#åè©-æ¥å°¾
+#
+#####
+#  adjective: unclassified adjectives
+#å½¢å®¹è©
+#
+#  adjective-main:
+#å½¢å®¹è©-èªç«
+#
+#  adjective-auxiliary:
+#å½¢å®¹è©-éèªç«
+#
+#  adjective-suffix:
+#å½¢å®¹è©-æ¥å°¾
+#
+#####
+#  adverb: unclassified adverbs
+#å¯è©
+#
+#  adverb-misc: Words that can be segmented into one unit and where adnominal 
+#  modification is not possible.
+#  e.g. ãããããã, å¤å
+#å¯è©-ä¸è¬
+#
+#  adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«, 
+#  ãª, ãã, ã , etc.
+#  e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã
+#å¯è©-å©è©é¡æ¥ç¶
+#
+#####
+#  adnominal: Words that only have noun-modifying forms.
+#  e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, 
ä½ããã®, ããããª, ãããã, ãããã, ãããã, 
+#       ã©ããã, ãããª, ãããª, ãããª, ã©ããª, å¤§ããª, 
å°ããª, ããããª, ã»ãã®, ãããã, 
+#       ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ããã, å ããã, 
åãªã, ãããªã, æãããåã, äº¡ã
+#é£ä½è©
+#
+#####
+#  conjunction: Conjunctions that can occur independently.
+#  e.g. が, けれども, そして, じゃあ, それどころか
+接続詞
+#
+#####
+#  particle: unclassified particles.
+助詞
+#
+#  particle-case: case particles where the subclassification is undefined.
+助詞-格助詞
+#
+#  particle-case-misc: Case particles.
+#  e.g. から, が, で, と, に, へ, より, を, の, にて
+助詞-格助詞-一般
+#
+#  particle-case-quote: the "to" that appears after nouns, a person’s speech,
+#  quotation marks, expressions of decisions from a meeting, reasons, judgements,
+#  conjectures, etc.
+#  e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
+助詞-格助詞-引用
+#
+#  particle-case-compound: Compounds of particles and verbs that mainly behave 
+#  like case particles.
+#  e.g. ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å
±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦,
+#       ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, 
ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã, 
+#       ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, 
ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã, 
+#       ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, 
ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦, 
+#       ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, 
ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦,
+#       ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, 
ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã, 
+#       ã«ããã£ã¦, ã«ããã, ããã£ã¦, ãä»¥ã£ã¦, ãéã, 
ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã,
+#       ã£ã¦-å£èª/, ã¡ãã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã 
(äºº)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ
+助詞-格助詞-連語
+#
+#  particle-conjunctive:
+#  e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, 
ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã, 
+#       ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), 
ãããªã, (ããã) ãã(ãããªã)-å£èª/, 
+#       (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ 
(ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/
+助詞-接続助詞
+#
+#  particle-dependency:
+#  e.g. こそ, さえ, しか, すら, は, も, ぞ
+助詞-係助詞
+#
+#  particle-adverbial:
+#  e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (å¦æ ¡) 
ãã(ãããæµè¡ã£ã¦ãã)-å£èª/, 
+#       (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, 
ãªã©, (ç§) ãªã (ã«), (åç) ãªãã (å¤§å«ã)-å£èª/,
+#       (ç§) ãªãã, (åç) ãªãã¦ (å¤§å«ã)-å£èª/, ã®ã¿, ã 
ã, (ç§) ã ã£ã¦-å£èª/, ã ã«, 
+#       (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), 
(ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/,
+#       ã»ã©, ç¨, ã¾ã§, è¿, (èª°) ã (ã)([å©è©-æ ¼å©è©] ããã³ 
[å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã)
+助詞-副助詞
+#
+#  particle-interjective: particles with interjective grammatical roles.
+#  e.g. (松島) や
+助詞-間投助詞
+#
+#  particle-coordinate:
+#  e.g. と, たり, だの, だり, とか, なり, や, やら
+助詞-並立助詞
+#
+#  particle-final:
+#  e.g. ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) 
ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã, 
+#       ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, 
ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/
+助詞-終助詞
+#
+#  particle-adverbial/conjunctive/final: The particle "ka" when unknown 
whether it is 
+#  adverbial, conjunctive, or sentence final. For example:
+#       (a) ãA ã B ãã. Ex:ã(å½åã§éç¨ãã) 
ã,(æµ·å¤ã§éç¨ãã) ã (.)ã
+#       (b) Inside an adverb phrase. Ex:ã(å¹¸ãã¨ãã) ã (, æ»è
ã¯ããªãã£ã.)ã
+#           ã(ç¥ããå±ãããã) ã (, è©¦é¨ã«åæ ¼ãã.)ã
+#       (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã 
(ã®ããã«æ¯ãèã£ã.)ã
+#  e.g. か
+助詞-副助詞／並立助詞／終助詞
+#
+#  particle-adnominalizer: The "no" that attaches to nouns and modifies
+#  non-inflectional words.
+助詞-連体化
+#
+#  particle-adverbializer: The "ni" and "to" that appear following nouns and adverbs
+#  that are giongo, giseigo, or gitaigo.
+#  e.g. に, と
+助詞-副詞化
+#
+#  particle-special: A particle that does not fit into one of the above classifications.
+#  This includes particles that are used in Tanka, Haiku, and other poetry.
+#  e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
+助詞-特殊
+#
+#####
+#  auxiliary-verb:
+助動詞
+#
+#####
+#  interjection: Greetings and other exclamations.
+#  e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, 
ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, 
ãããã¨ããããã¾ã, 
+#       ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, 
ã¯ã, ããã, ããã, ããããªãã
+#æåè©
+#
+#####
+#  symbol: unclassified Symbols.
+記号
+#
+#  symbol-misc: A general symbol not in one of the categories below.
+#  e.g. [○◎@$〒→+]
+記号-一般
+#
+#  symbol-comma: Commas
+#  e.g. [,、]
+記号-読点
+#
+#  symbol-period: Periods and full stops.
+#  e.g. [.．。]
+記号-句点
+#
+#  symbol-space: Full-width whitespace.
+記号-空白
+#
+#  symbol-open_bracket:
+#  e.g. [({‘“『【]
+記号-括弧開
+#
+#  symbol-close_bracket:
+#  e.g. [)}’”』」】]
+記号-括弧閉
+#
+#  symbol-alphabetic:
+#è¨å·-ã¢ã«ãã¡ããã
+#
+#####
+#  other: unclassified other
+#ãã®ä»
+#
+#  other-interjection: Words that are hard to classify as noun-suffixes or
+#  sentence-final particles.
+#  e.g. (だ)ァ
+その他-間投
+#
+#####
+#  filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
+#  e.g. あの, うんと, えと
+フィラー
+#
+#####
+#  non-verbal: non-verbal sound.
+非言語音
+#
+#####
+#  fragment:
+#èªæç
+#
+#####
+#  unknown: unknown part of speech.
+#æªç¥èª
+#
+##### End of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt 
b/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt
new file mode 100644
index 0000000..d4321be
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt
@@ -0,0 +1,127 @@
+#
+# This file defines a stopword set for Japanese.
+#
+# This set is made up of hand-picked frequent terms from segmented Japanese 
Wikipedia.
+# Punctuation characters and frequent kanji have mostly been left out.  See 
LUCENE-3745
+# for frequency lists, etc. that can be useful for making your own set (if 
desired)
+#
+# Note that there is an overlap between these stopwords and the terms stopped 
when used
+# in combination with the JapanesePartOfSpeechStopFilter.  When editing this 
file, note
+# that comments are not allowed on the same line as stopwords.
+#
+# Also note that stopping is done in a case-insensitive manner.  Change your 
StopFilter
+# configuration if you need case-sensitive stopping.  Lastly, note that 
stopping is done
+# using the same character width as the entries in this file.  Since this 
StopFilter is
+# normally done after a CJKWidthFilter in your chain, you would usually want 
your romaji
+# entries to be in half-width and your kana entries to be in full-width.
+#
+の
+に
+は
+を
+た
+が
+で
+て
+と
+し
+れ
+さ
+ある
+いる
+も
+する
+から
+な
+こと
+として
+い
+や
+れる
+など
+なっ
+ない
+この
+ため
+その
+あっ
+よう
+また
+もの
+という
+あり
+まで
+られ
+なる
+へ
+か
+だ
+これ
+によって
+により
+おり
+より
+による
+ず
+なり
+られる
+において
+ば
+なかっ
+なく
+しかし
+について
+せ
+だっ
+その後
+できる
+それ
+う
+ので
+なお
+のみ
+でき
+き
+つ
+における
+および
+いう
+さらに
+でも
+ら
+たり
+その他
+に関する
+たち
+ます
+ん
+なら
+に対して
+特に
+せる
+及び
+これら
+とき
+では
+にて
+ほか
+ながら
+うち
+そして
+とともに
+ただし
+かつて
+それぞれ
+または
+お
+ほど
+ものの
+に対する
+ほとんど
+と共に
+といった
+です
+とも
+ところ
+ここ
+##### End of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs
new file mode 100644
index 0000000..dd305a4
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs
@@ -0,0 +1,114 @@
+ï»¿using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Sanity tests for the <see cref="TokenInfoDictionary"/> instance returned by
+    /// <see cref="TokenInfoDictionary.GetInstance()"/>.
+    /// </summary>
+    public class TestTokenInfoDictionary : LuceneTestCase
+    {
+        /// <summary>
+        /// Enumerates the entire FST/lookup data and just does basic sanity checks:
+        /// every term and every per-word attribute must be a valid UTF-16 string,
+        /// source ids and word ids must be strictly increasing, and every part of
+        /// speech, inflection form and inflection type must have a translation in
+        /// <see cref="ToStringUtil"/>.
+        /// </summary>
+        [Test]
+        public void TestEnumerateAll()
+        {
+            // just for debugging
+            int numTerms = 0;
+            int numWords = 0;
+            int lastWordId = -1;
+            int lastSourceId = -1;
+            TokenInfoDictionary tid = TokenInfoDictionary.GetInstance();
+            ConnectionCosts matrix = ConnectionCosts.GetInstance();
+            FST<long?> fst = tid.FST.InternalFST;
+            Int32sRefFSTEnum<long?> fstEnum = new Int32sRefFSTEnum<long?>(fst);
+            Int32sRefFSTEnum.InputOutput<long?> mapping;
+            Int32sRef scratch = new Int32sRef();
+            while ((mapping = fstEnum.Next()) != null)
+            {
+                numTerms++;
+
+                // Rebuild the term text from the FST input, one int label per UTF-16 code unit.
+                Int32sRef input = mapping.Input;
+                char[] chars = new char[input.Length];
+                for (int i = 0; i < chars.Length; i++)
+                {
+                    chars[i] = (char)input.Int32s[input.Offset + i];
+                }
+                assertTrue(UnicodeUtil.ValidUTF16String(new string(chars)));
+
+                long? output = mapping.Output;
+                int sourceId = (int)output.Value;
+                // we walk in order, terms, sourceIds, and wordIds should always be increasing
+                assertTrue(sourceId > lastSourceId);
+                lastSourceId = sourceId;
+                tid.LookupWordIds(sourceId, scratch);
+                for (int i = 0; i < scratch.Length; i++)
+                {
+                    numWords++;
+                    int wordId = scratch.Int32s[scratch.Offset + i];
+                    assertTrue(wordId > lastWordId);
+                    lastWordId = wordId;
+
+                    string baseForm = tid.GetBaseForm(wordId, chars, 0, chars.Length);
+                    assertTrue(baseForm == null || UnicodeUtil.ValidUTF16String(baseForm));
+
+                    string inflectionForm = tid.GetInflectionForm(wordId);
+                    assertTrue(inflectionForm == null || UnicodeUtil.ValidUTF16String(inflectionForm));
+                    if (inflectionForm != null)
+                    {
+                        // check that it's actually an ipadic inflection form
+                        assertNotNull(ToStringUtil.GetInflectedFormTranslation(inflectionForm));
+                    }
+
+                    string inflectionType = tid.GetInflectionType(wordId);
+                    assertTrue(inflectionType == null || UnicodeUtil.ValidUTF16String(inflectionType));
+                    if (inflectionType != null)
+                    {
+                        // check that it's actually an ipadic inflection type
+                        assertNotNull(ToStringUtil.GetInflectionTypeTranslation(inflectionType));
+                    }
+
+                    int leftId = tid.GetLeftId(wordId);
+                    int rightId = tid.GetRightId(wordId);
+
+                    // The results of the next two calls are deliberately discarded;
+                    // presumably they only need to complete without throwing.
+                    matrix.Get(rightId, leftId);
+
+                    tid.GetWordCost(wordId);
+
+                    string pos = tid.GetPartOfSpeech(wordId);
+                    assertNotNull(pos);
+                    assertTrue(UnicodeUtil.ValidUTF16String(pos));
+                    // check that it's actually an ipadic pos tag
+                    assertNotNull(ToStringUtil.GetPOSTranslation(pos));
+
+                    string pronunciation = tid.GetPronunciation(wordId, chars, 0, chars.Length);
+                    assertNotNull(pronunciation);
+                    assertTrue(UnicodeUtil.ValidUTF16String(pronunciation));
+
+                    string reading = tid.GetReading(wordId, chars, 0, chars.Length);
+                    assertNotNull(reading);
+                    assertTrue(UnicodeUtil.ValidUTF16String(reading));
+                }
+            }
+            if (VERBOSE)
+            {
+                Console.WriteLine("checked " + numTerms + " terms, " + numWords + " words.");
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs 
b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs
new file mode 100644
index 0000000..f899476
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs
@@ -0,0 +1,90 @@
+ï»¿using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests <see cref="UserDictionary"/> lookups, readings and part-of-speech data
+    /// against the dictionary returned by <c>TestJapaneseTokenizer.ReadDict()</c>.
+    /// </summary>
+    public class UserDictionaryTest : LuceneTestCase
+    {
+        /// <summary>
+        /// Looks up a sentence and checks the number of matched entries together
+        /// with each entry's start index (slot 1) and length (slot 2).
+        /// </summary>
+        [Test]
+        public void TestLookup()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            string s = "関西国際空港に行った";
+            int[][] dictionaryEntryResult = dictionary.Lookup(s.toCharArray(), 0, s.Length);
+            // Length should be three 関西, 国際, 空港
+            assertEquals(3, dictionaryEntryResult.Length);
+
+            // Test positions
+            assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
+            assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
+            assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
+
+            // Test lengths
+            assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
+            assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
+            assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
+
+            s = "関西国際空港と関西国際空港に行った";
+            int[][] dictionaryEntryResult2 = dictionary.Lookup(s.toCharArray(), 0, s.Length);
+            // Length should be six
+            assertEquals(6, dictionaryEntryResult2.Length);
+        }
+
+        /// <summary>
+        /// Checks the readings stored for user-dictionary entries, addressed by the
+        /// word id found in slot 0 of each lookup result.
+        /// </summary>
+        [Test]
+        public void TestReadings()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            int[][] result = dictionary.Lookup("日本経済新聞".toCharArray(), 0, 6);
+            assertEquals(3, result.Length);
+            int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
+            assertEquals("ニホン", dictionary.GetReading(wordIdNihon, "日本".toCharArray(), 0, 2));
+
+            result = dictionary.Lookup("朝青龍".toCharArray(), 0, 3);
+            assertEquals(1, result.Length);
+            int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
+            assertEquals("アサショウリュウ", dictionary.GetReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
+        }
+
+        /// <summary>
+        /// Checks the part-of-speech string stored for a user-dictionary entry.
+        /// </summary>
+        [Test]
+        public void TestPartOfSpeech()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            int[][] result = dictionary.Lookup("日本経済新聞".toCharArray(), 0, 6);
+            assertEquals(3, result.Length);
+            int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
+            assertEquals("カスタム名詞", dictionary.GetPartOfSpeech(wordIdKeizai));
+        }
+
+        /// <summary>
+        /// Verifies that the test user dictionary can be read at all.
+        /// </summary>
+        [Test]
+        public void TestRead()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            assertNotNull(dictionary);
+        }
+    }
+}

[07/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

Reply via email to