http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs b/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs new file mode 100644 index 0000000..95e2703 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Util/ToStringUtil.cs @@ -0,0 +1,1401 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Ja.Util +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Utility class for english translations of morphological data, + /// used only for debugging. 
+ /// </summary> + public static class ToStringUtil + { + // a translation map for parts of speech, only used for reflectWith + private static readonly IDictionary<string, string> posTranslations = new Dictionary<string, string>(StringComparer.Ordinal) + { + { "åè©", "noun"}, + { "åè©-ä¸è¬", "noun-common" }, + { "åè©-åºæåè©", "noun-proper" }, + { "åè©-åºæåè©-ä¸è¬", "noun-proper-misc" }, + { "åè©-åºæåè©-人å", "noun-proper-person" }, + { "åè©-åºæåè©-人å-ä¸è¬", "noun-proper-person-misc" }, + { "åè©-åºæåè©-人å-å§", "noun-proper-person-surname" }, + { "åè©-åºæåè©-人å-å", "noun-proper-person-given_name" }, + { "åè©-åºæåè©-çµç¹", "noun-proper-organization" }, + { "åè©-åºæåè©-å°å", "noun-proper-place" }, + { "åè©-åºæåè©-å°å-ä¸è¬", "noun-proper-place-misc" }, + { "åè©-åºæåè©-å°å-å½", "noun-proper-place-country" }, + { "åè©-代åè©", "noun-pronoun" }, + { "åè©-代åè©-ä¸è¬", "noun-pronoun-misc" }, + { "åè©-代åè©-縮ç´", "noun-pronoun-contraction" }, + { "åè©-å¯è©å¯è½", "noun-adverbial" }, + { "åè©-ãµå¤æ¥ç¶", "noun-verbal" }, + { "åè©-形容åè©èªå¹¹", "noun-adjective-base" }, + { "åè©-æ°", "noun-numeric" }, + { "åè©-éèªç«", "noun-affix" }, + { "åè©-éèªç«-ä¸è¬", "noun-affix-misc" }, + { "åè©-éèªç«-å¯è©å¯è½", "noun-affix-adverbial" }, + { "åè©-éèªç«-å©åè©èªå¹¹", "noun-affix-aux" }, + { "åè©-éèªç«-形容åè©èªå¹¹", "noun-affix-adjective-base" }, + { "åè©-ç¹æ®", "noun-special" }, + { "åè©-ç¹æ®-å©åè©èªå¹¹", "noun-special-aux" }, + { "åè©-æ¥å°¾", "noun-suffix" }, + { "åè©-æ¥å°¾-ä¸è¬", "noun-suffix-misc" }, + { "åè©-æ¥å°¾-人å", "noun-suffix-person" }, + { "åè©-æ¥å°¾-å°å", "noun-suffix-place" }, + { "åè©-æ¥å°¾-ãµå¤æ¥ç¶", "noun-suffix-verbal" }, + { "åè©-æ¥å°¾-å©åè©èªå¹¹", "noun-suffix-aux" }, + { "åè©-æ¥å°¾-形容åè©èªå¹¹", "noun-suffix-adjective-base" }, + { "åè©-æ¥å°¾-å¯è©å¯è½", "noun-suffix-adverbial" }, + { "åè©-æ¥å°¾-婿°è©", "noun-suffix-classifier" }, + { "åè©-æ¥å°¾-ç¹æ®", "noun-suffix-special" }, + { "åè©-æ¥ç¶è©ç", "noun-suffix-conjunctive" }, + { "åè©-åè©éèªç«ç", "noun-verbal_aux" }, + { "åè©-å¼ç¨æåå", 
"noun-quotation" }, + { "åè©-ãã¤å½¢å®¹è©èªå¹¹", "noun-nai_adjective" }, + { "æ¥é è©", "prefix" }, + { "æ¥é è©-åè©æ¥ç¶", "prefix-nominal" }, + { "æ¥é è©-åè©æ¥ç¶", "prefix-verbal" }, + { "æ¥é è©-å½¢å®¹è©æ¥ç¶", "prefix-adjectival" }, + { "æ¥é è©-æ°æ¥ç¶", "prefix-numerical" }, + { "åè©", "verb" }, + { "åè©-èªç«", "verb-main" }, + { "åè©-éèªç«", "verb-auxiliary" }, + { "åè©-æ¥å°¾", "verb-suffix" }, + { "形容è©", "adjective" }, + { "形容è©-èªç«", "adjective-main" }, + { "形容è©-éèªç«", "adjective-auxiliary" }, + { "形容è©-æ¥å°¾", "adjective-suffix" }, + { "å¯è©", "adverb" }, + { "å¯è©-ä¸è¬", "adverb-misc" }, + { "å¯è©-å©è©é¡æ¥ç¶", "adverb-particle_conjunction" }, + { "é£ä½è©", "adnominal" }, + { "æ¥ç¶è©", "conjunction" }, + { "å©è©", "particle" }, + { "å©è©-æ ¼å©è©", "particle-case" }, + { "å©è©-æ ¼å©è©-ä¸è¬", "particle-case-misc" }, + { "å©è©-æ ¼å©è©-å¼ç¨", "particle-case-quote" }, + { "å©è©-æ ¼å©è©-é£èª", "particle-case-compound" }, + { "å©è©-æ¥ç¶å©è©", "particle-conjunctive" }, + { "å©è©-ä¿å©è©", "particle-dependency" }, + { "å©è©-å¯å©è©", "particle-adverbial" }, + { "å©è©-éæå©è©", "particle-interjective" }, + { "å©è©-並ç«å©è©", "particle-coordinate" }, + { "å©è©-çµå©è©", "particle-final" }, + { "å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è©", "particle-adverbial/conjunctive/final" }, + { "å©è©-é£ä½å", "particle-adnominalizer" }, + { "å©è©-å¯è©å", "particle-adnominalizer" }, + { "å©è©-ç¹æ®", "particle-special" }, + { "å©åè©", "auxiliary-verb" }, + { "æåè©", "interjection" }, + { "è¨å·", "symbol" }, + { "è¨å·-ä¸è¬", "symbol-misc" }, + { "è¨å·-å¥ç¹", "symbol-period" }, + { "è¨å·-èªç¹", "symbol-comma" }, + { "è¨å·-空ç½", "symbol-space" }, + { "è¨å·-æ¬å¼§é", "symbol-open_bracket" }, + { "è¨å·-æ¬å¼§é", "symbol-close_bracket" }, + { "è¨å·-ã¢ã«ãã¡ããã", "symbol-alphabetic" }, + { "ãã®ä»", "other" }, + { "ãã®ä»-éæ", "other-interjection" }, + { "ãã£ã©ã¼", "filler" }, + { "éè¨èªé³", "non-verbal" }, + { "èªæç", "fragment" }, + { "æªç¥èª", "unknown" } + }; + + + /// <summary> + /// Get the english form of 
a POS tag + /// </summary> + public static string GetPOSTranslation(string s) + { + string result; + posTranslations.TryGetValue(s, out result); + return result; + } + + // a translation map for inflection types, only used for reflectWith + private static readonly IDictionary<string, string> inflTypeTranslations = new Dictionary<string, string>(StringComparer.Ordinal) + { + { "*", "*" }, + { "形容è©ã»ã¢ã¦ãªæ®µ", "adj-group-a-o-u" }, + { "形容è©ã»ã¤æ®µ", "adj-group-i" }, + { "形容è©ã»ã¤ã¤", "adj-group-ii" }, + { "ä¸å¤åå", "non-inflectional" }, + { "ç¹æ®ã»ã¿", "special-da" }, + { "ç¹æ®ã»ã", "special-ta" }, + { "æèªã»ã´ãã·", "classical-gotoshi" }, + { "ç¹æ®ã»ã¸ã£", "special-ja" }, + { "ç¹æ®ã»ãã¤", "special-nai" }, + { "äºæ®µã»ã©è¡ç¹æ®", "5-row-cons-r-special" }, + { "ç¹æ®ã»ã", "special-nu" }, + { "æèªã»ã", "classical-ki" }, + { "ç¹æ®ã»ã¿ã¤", "special-tai" }, + { "æèªã»ãã·", "classical-beshi" }, + { "ç¹æ®ã»ã¤", "special-ya" }, + { "æèªã»ãã¸", "classical-maji" }, + { "ä¸äºã»ã¿è¡", "2-row-lower-cons-t" }, + { "ç¹æ®ã»ãã¹", "special-desu" }, + { "ç¹æ®ã»ãã¹", "special-masu" }, + { "äºæ®µã»ã©è¡ã¢ã«", "5-row-aru" }, + { "æèªã»ããª", "classical-nari" }, + { "æèªã»ãª", "classical-ri" }, + { "æèªã»ã±ãª", "classical-keri" }, + { "æèªã»ã«", "classical-ru" }, + { "äºæ®µã»ã«è¡ã¤é³ä¾¿", "5-row-cons-k-i-onbin" }, + { "äºæ®µã»ãµè¡", "5-row-cons-s" }, + { "䏿®µ", "1-row" }, + { "äºæ®µã»ã¯è¡ä¿é³ä¾¿", "5-row-cons-w-cons-onbin" }, + { "äºæ®µã»ãè¡", "5-row-cons-m" }, + { "äºæ®µã»ã¿è¡", "5-row-cons-t" }, + { "äºæ®µã»ã©è¡", "5-row-cons-r" }, + { "ãµå¤ã»âã¹ã«", "irregular-suffix-suru" }, + { "äºæ®µã»ã¬è¡", "5-row-cons-g" }, + { "ãµå¤ã»âãºã«", "irregular-suffix-zuru" }, + { "äºæ®µã»ãè¡", "5-row-cons-b" }, + { "äºæ®µã»ã¯è¡ã¦é³ä¾¿", "5-row-cons-w-u-onbin" }, + { "ä¸äºã»ãè¡", "2-row-lower-cons-d" }, + { "äºæ®µã»ã«è¡ä¿é³ä¾¿ã¦ã¯", "5-row-cons-k-cons-onbin-yuku" }, + { "ä¸äºã»ãè¡", "2-row-upper-cons-d" }, + { "äºæ®µã»ã«è¡ä¿é³ä¾¿", "5-row-cons-k-cons-onbin" }, + { "䏿®µã»å¾ã«", "1-row-eru" }, + { "åæ®µã»ã¿è¡", 
"4-row-cons-t" }, + { "äºæ®µã»ãè¡", "5-row-cons-n" }, + { "ä¸äºã»ãè¡", "2-row-lower-cons-h" }, + { "åæ®µã»ãè¡", "4-row-cons-h" }, + { "åæ®µã»ãè¡", "4-row-cons-b" }, + { "ãµå¤ã»ã¹ã«", "irregular-suru" }, + { "ä¸äºã»ãè¡", "2-row-upper-cons-h" }, + { "ä¸äºã»ãè¡", "2-row-lower-cons-m" }, + { "åæ®µã»ãµè¡", "4-row-cons-s" }, + { "ä¸äºã»ã¬è¡", "2-row-lower-cons-g" }, + { "ã«å¤ã»æ¥ã«", "kuru-kanji" }, + { "䏿®µã»ã¯ã¬ã«", "1-row-kureru" }, + { "ä¸äºã»å¾", "2-row-lower-u" }, + { "ã«å¤ã»ã¯ã«", "kuru-kana" }, + { "ã©å¤", "irregular-cons-r" }, + { "ä¸äºã»ã«è¡", "2-row-lower-cons-k" }, + }; + + + /// <summary> + /// Get the english form of inflection type + /// </summary> + public static string GetInflectionTypeTranslation(string s) + { + string result; + inflTypeTranslations.TryGetValue(s, out result); + return result; + } + + // a translation map for inflection forms, only used for reflectWith + private static readonly IDictionary<string, string> inflFormTranslations = new Dictionary<string, string>(StringComparer.Ordinal) + { + { "*", "*" }, + { "åºæ¬å½¢", "base" }, + { "æèªåºæ¬å½¢", "classical-base" }, + { "æªç¶ãæ¥ç¶", "imperfective-nu-connection" }, + { "æªç¶ã¦æ¥ç¶", "imperfective-u-connection" }, + { "é£ç¨ã¿æ¥ç¶", "conjunctive-ta-connection" }, + { "é£ç¨ãæ¥ç¶", "conjunctive-te-connection" }, + { "é£ç¨ã´ã¶ã¤æ¥ç¶", "conjunctive-gozai-connection" }, + { "ä½è¨æ¥ç¶", "uninflected-connection" }, + { "ä»®å®å½¢", "subjunctive" }, + { "å½ä»¤ï½ ", "imperative-e" }, + { "ä»®å®ç¸®ç´ï¼", "conditional-contracted-1" }, + { "ä»®å®ç¸®ç´ï¼", "conditional-contracted-2" }, + { "ã¬ã«æ¥ç¶", "garu-connection" }, + { "æªç¶å½¢", "imperfective" }, + { "é£ç¨å½¢", "conjunctive" }, + { "é³ä¾¿åºæ¬å½¢", "onbin-base" }, + { "é£ç¨ãæ¥ç¶", "conjunctive-de-connection" }, + { "æªç¶ç¹æ®", "imperfective-special" }, + { "å½ä»¤ï½", "imperative-i" }, + { "é£ç¨ãæ¥ç¶", "conjunctive-ni-connection" }, + { "å½ä»¤ï½ï½", "imperative-yo" }, + { "ä½è¨æ¥ç¶ç¹æ®", "adnominal-special" }, + { "å½ä»¤ï½ï½", "imperative-ro" }, + { 
"ä½è¨æ¥ç¶ç¹æ®ï¼", "uninflected-special-connection-2" }, + { "æªç¶ã¬ã«æ¥ç¶", "imperfective-reru-connection" }, + { "ç¾ä»£åºæ¬å½¢", "modern-base" }, + { "åºæ¬å½¢-ä¿é³ä¾¿", "base-onbin" }, // not sure about this + }; + + + /// <summary> + /// Get the english form of inflected form + /// </summary> + public static string GetInflectedFormTranslation(string s) + { + string result; + inflFormTranslations.TryGetValue(s, out result); + return result; + } + + /// <summary> + /// Romanize katakana with modified hepburn + /// </summary> + public static string GetRomanization(string s) + { + StringBuilder result = new StringBuilder(); + try + { + GetRomanization(result, s); + } + catch (IOException bogus) + { + throw new Exception(bogus.ToString(), bogus); + } + return result.ToString(); + } + + /// <summary> + /// Romanize katakana with modified hepburn + /// </summary> + // TODO: now that this is used by readingsfilter and not just for + // debugging, fix this to really be a scheme that works best with IMEs + public static void GetRomanization(StringBuilder builder, string s) + { + int len = s.Length; + for (int i = 0; i < len; i++) + { + // maximum lookahead: 3 + char ch = s[i]; + char ch2 = (i < len - 1) ? s[i + 1] : (char)0; + char ch3 = (i < len - 2) ? 
s[i + 2] : (char)0; + + //main: + switch (ch) + { + + case 'ã': + switch (ch2) + { + case 'ã«': + case 'ã': + case 'ã¯': + case 'ã±': + case 'ã³': + builder.Append('k'); + goto break_main; + case 'ãµ': + case 'ã·': + case 'ã¹': + case 'ã»': + case 'ã½': + builder.Append('s'); + goto break_main; + case 'ã¿': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + builder.Append('t'); + goto break_main; + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + builder.Append('p'); + goto break_main; + } + break; + case 'ã¢': + builder.Append('a'); + break; + case 'ã¤': + if (ch2 == 'ã£') + { + builder.Append("yi"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("ye"); + i++; + } + else + { + builder.Append('i'); + } + break; + case 'ã¦': + switch (ch2) + { + case 'ã¡': + builder.Append("wa"); + i++; + break; + case 'ã£': + builder.Append("wi"); + i++; + break; + case 'ã¥': + builder.Append("wu"); + i++; + break; + case 'ã§': + builder.Append("we"); + i++; + break; + case 'ã©': + builder.Append("wo"); + i++; + break; + case 'ã¥': + builder.Append("wyu"); + i++; + break; + default: + builder.Append('u'); + break; + } + break; + case 'ã¨': + builder.Append('e'); + break; + case 'ãª': + if (ch2 == 'ã¦') + { + builder.Append('Å'); + i++; + } + else + { + builder.Append('o'); + } + break; + case 'ã«': + builder.Append("ka"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("kyÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("kyÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("kya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("kyo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("kyu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("kye"); + i++; + } + else + { + builder.Append("ki"); + } + break; + case 'ã¯': + switch (ch2) + { + case 'ã¡': + builder.Append("kwa"); + i++; + break; + case 'ã£': + builder.Append("kwi"); + i++; + break; + case 'ã§': + 
builder.Append("kwe"); + i++; + break; + case 'ã©': + builder.Append("kwo"); + i++; + break; + case 'ã®': + builder.Append("kwa"); + i++; + break; + default: + builder.Append("ku"); + break; + } + break; + case 'ã±': + builder.Append("ke"); + break; + case 'ã³': + if (ch2 == 'ã¦') + { + builder.Append("kÅ"); + i++; + } + else + { + builder.Append("ko"); + } + break; + case 'ãµ': + builder.Append("sa"); + break; + case 'ã·': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("shÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("shÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("sha"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("sho"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("shu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("she"); + i++; + } + else + { + builder.Append("shi"); + } + break; + case 'ã¹': + if (ch2 == 'ã£') + { + builder.Append("si"); + i++; + } + else + { + builder.Append("su"); + } + break; + case 'ã»': + builder.Append("se"); + break; + case 'ã½': + if (ch2 == 'ã¦') + { + builder.Append("sÅ"); + i++; + } + else + { + builder.Append("so"); + } + break; + case 'ã¿': + builder.Append("ta"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("chÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("chÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("cha"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("cho"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("chu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("che"); + i++; + } + else + { + builder.Append("chi"); + } + break; + case 'ã': + if (ch2 == 'ã¡') + { + builder.Append("tsa"); + i++; + } + else if (ch2 == 'ã£') + { + builder.Append("tsi"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("tse"); + i++; + } + else if (ch2 == 'ã©') + { + builder.Append("tso"); + i++; + } + else if (ch2 == 'ã¥') + { + 
builder.Append("tsyu"); + i++; + } + else + { + builder.Append("tsu"); + } + break; + case 'ã': + if (ch2 == 'ã£') + { + builder.Append("ti"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("tu"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("tyu"); + i++; + } + else + { + builder.Append("te"); + } + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("tÅ"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("tu"); + i++; + } + else + { + builder.Append("to"); + } + break; + case 'ã': + builder.Append("na"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("nyÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("nyÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("nya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("nyo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("nyu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("nye"); + i++; + } + else + { + builder.Append("ni"); + } + break; + case 'ã': + builder.Append("nu"); + break; + case 'ã': + builder.Append("ne"); + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("nÅ"); + i++; + } + else + { + builder.Append("no"); + } + break; + case 'ã': + builder.Append("ha"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("hyÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("hyÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("hya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("hyo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("hyu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("hye"); + i++; + } + else + { + builder.Append("hi"); + } + break; + case 'ã': + if (ch2 == 'ã£') + { + builder.Append("fya"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("fyu"); + i++; + } + else if (ch2 == 'ã£' && ch3 == 'ã§') + { + builder.Append("fye"); + i += 2; + } + else if (ch2 == 
'ã§') + { + builder.Append("fyo"); + i++; + } + else if (ch2 == 'ã¡') + { + builder.Append("fa"); + i++; + } + else if (ch2 == 'ã£') + { + builder.Append("fi"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("fe"); + i++; + } + else if (ch2 == 'ã©') + { + builder.Append("fo"); + i++; + } + else + { + builder.Append("fu"); + } + break; + case 'ã': + builder.Append("he"); + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("hÅ"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("hu"); + i++; + } + else + { + builder.Append("ho"); + } + break; + case 'ã': + builder.Append("ma"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("myÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("myÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("mya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("myo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("myu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("mye"); + i++; + } + else + { + builder.Append("mi"); + } + break; + case 'ã ': + builder.Append("mu"); + break; + case 'ã¡': + builder.Append("me"); + break; + case 'ã¢': + if (ch2 == 'ã¦') + { + builder.Append("mÅ"); + i++; + } + else + { + builder.Append("mo"); + } + break; + case 'ã¤': + builder.Append("ya"); + break; + case 'ã¦': + builder.Append("yu"); + break; + case 'ã¨': + if (ch2 == 'ã¦') + { + builder.Append("yÅ"); + i++; + } + else + { + builder.Append("yo"); + } + break; + case 'ã©': + if (ch2 == 'ã') + { + builder.Append("la"); + i++; + } + else + { + builder.Append("ra"); + } + break; + case 'ãª': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("ryÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("ryÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("rya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("ryo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("ryu"); + i++; + } + 
else if (ch2 == 'ã§') + { + builder.Append("rye"); + i++; + } + else if (ch2 == 'ã') + { + builder.Append("li"); + i++; + } + else + { + builder.Append("ri"); + } + break; + case 'ã«': + if (ch2 == 'ã') + { + builder.Append("lu"); + i++; + } + else + { + builder.Append("ru"); + } + break; + case 'ã¬': + if (ch2 == 'ã') + { + builder.Append("le"); + i++; + } + else + { + builder.Append("re"); + } + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("rÅ"); + i++; + } + else if (ch2 == 'ã') + { + builder.Append("lo"); + i++; + } + else + { + builder.Append("ro"); + } + break; + case 'ã¯': + builder.Append("wa"); + break; + case 'ã°': + builder.Append("i"); + break; + case 'ã±': + builder.Append("e"); + break; + case 'ã²': + builder.Append("o"); + break; + case 'ã³': + switch (ch2) + { + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã': + case 'ã ': + case 'ã¡': + case 'ã¢': + builder.Append('m'); + goto break_main; + case 'ã¤': + case 'ã¦': + case 'ã¨': + case 'ã¢': + case 'ã¤': + case 'ã¦': + case 'ã¨': + case 'ãª': + builder.Append("n'"); + goto break_main; + default: + builder.Append("n"); + goto break_main; + } + case 'ã¬': + builder.Append("ga"); + break; + case 'ã®': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("gyÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("gyÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("gya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("gyo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("gyu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("gye"); + i++; + } + else + { + builder.Append("gi"); + } + break; + case 'ã°': + switch (ch2) + { + case 'ã¡': + builder.Append("gwa"); + i++; + break; + case 'ã£': + builder.Append("gwi"); + i++; + break; + case 'ã§': + builder.Append("gwe"); + i++; + break; + case 'ã©': + builder.Append("gwo"); + i++; + 
break; + case 'ã®': + builder.Append("gwa"); + i++; + break; + default: + builder.Append("gu"); + break; + } + break; + case 'ã²': + builder.Append("ge"); + break; + case 'ã´': + if (ch2 == 'ã¦') + { + builder.Append("gÅ"); + i++; + } + else + { + builder.Append("go"); + } + break; + case 'ã¶': + builder.Append("za"); + break; + case 'ã¸': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("jÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("jÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("ja"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("jo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("ju"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("je"); + i++; + } + else + { + builder.Append("ji"); + } + break; + case 'ãº': + if (ch2 == 'ã£') + { + builder.Append("zi"); + i++; + } + else + { + builder.Append("zu"); + } + break; + case 'ã¼': + builder.Append("ze"); + break; + case 'ã¾': + if (ch2 == 'ã¦') + { + builder.Append("zÅ"); + i++; + } + else + { + builder.Append("zo"); + } + break; + case 'ã': + builder.Append("da"); + break; + case 'ã': + // TODO: investigate all this + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("jÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("jÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("ja"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("jo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("ju"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("je"); + i++; + } + else + { + builder.Append("ji"); + } + break; + case 'ã ': + builder.Append("zu"); + break; + case 'ã': + if (ch2 == 'ã£') + { + builder.Append("di"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("dyu"); + i++; + } + else + { + builder.Append("de"); + } + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("dÅ"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("du"); + i++; + } + else 
+ { + builder.Append("do"); + } + break; + case 'ã': + builder.Append("ba"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("byÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("byÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("bya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("byo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("byu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("bye"); + i++; + } + else + { + builder.Append("bi"); + } + break; + case 'ã': + builder.Append("bu"); + break; + case 'ã': + builder.Append("be"); + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("bÅ"); + i++; + } + else + { + builder.Append("bo"); + } + break; + case 'ã': + builder.Append("pa"); + break; + case 'ã': + if (ch2 == 'ã§' && ch3 == 'ã¦') + { + builder.Append("pyÅ"); + i += 2; + } + else if (ch2 == 'ã¥' && ch3 == 'ã¦') + { + builder.Append("pyÅ«"); + i += 2; + } + else if (ch2 == 'ã£') + { + builder.Append("pya"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("pyo"); + i++; + } + else if (ch2 == 'ã¥') + { + builder.Append("pyu"); + i++; + } + else if (ch2 == 'ã§') + { + builder.Append("pye"); + i++; + } + else + { + builder.Append("pi"); + } + break; + case 'ã': + builder.Append("pu"); + break; + case 'ã': + builder.Append("pe"); + break; + case 'ã': + if (ch2 == 'ã¦') + { + builder.Append("pÅ"); + i++; + } + else + { + builder.Append("po"); + } + break; + case 'ã·': + builder.Append("va"); + break; + case 'ã¸': + builder.Append("vi"); + break; + case 'ã¹': + builder.Append("ve"); + break; + case 'ãº': + builder.Append("vo"); + break; + case 'ã´': + if (ch2 == 'ã£' && ch3 == 'ã§') + { + builder.Append("vye"); + i += 2; + } + else + { + builder.Append('v'); + } + break; + case 'ã¡': + builder.Append('a'); + break; + case 'ã£': + builder.Append('i'); + break; + case 'ã¥': + builder.Append('u'); + break; + case 'ã§': + builder.Append('e'); + break; 
+ case 'ã©': + builder.Append('o'); + break; + case 'ã®': + builder.Append("wa"); + break; + case 'ã£': + builder.Append("ya"); + break; + case 'ã¥': + builder.Append("yu"); + break; + case 'ã§': + builder.Append("yo"); + break; + case 'ã¼': + break; + default: + builder.Append(ch); + break; + } + break_main: { } + } + } + } +}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/project.json b/src/Lucene.Net.Analysis.Kuromoji/project.json new file mode 100644 index 0000000..937b9bf --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/project.json @@ -0,0 +1,60 @@ +{ + "version": "4.8.0", + "title": "Lucene.Net.Analysis.Kuromoji", + "description": "Japanese Morphological Analyzer for the Lucene.Net full-text search engine library from The Apache Software Foundation.", + "authors": [ "The Apache Software Foundation" ], + "packOptions": { + "projectUrl": "http://lucenenet.apache.org/", + "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt", + "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true", + "owners": [ "The Apache Software Foundation" ], + "repository": { "url": "https://github.com/apache/lucenenet" }, + "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query", "japanese" ], + "releaseNotes": "This package depends on a temporary version of icu.net hosted on MyGet until official .NET Core support is added. To install, copy the NuGet.config file from https://github.com/apache/lucenenet/blob/master/NuGet.config into your project and then install this package via Package Manager Console as usual." 
+ }, + "buildOptions": { + "compile": { + "includeFiles": [ "../CommonAssemblyInfo.cs" ] + }, + "embed": { + "includeFiles": [ + "stoptags.txt", + "stopwords.txt", + "Dict/CharacterDefinition.dat", + "Dict/ConnectionCosts.dat", + "Dict/TokenInfoDictionary$buffer.dat", + "Dict/TokenInfoDictionary$fst.dat", + "Dict/TokenInfoDictionary$posDict.dat", + "Dict/TokenInfoDictionary$targetMap.dat", + "Dict/UnknownDictionary$buffer.dat", + "Dict/UnknownDictionary$posDict.dat", + "Dict/UnknownDictionary$targetMap.dat" + ] + }, + "nowarn": [ "1591", "1573" ] + }, + "dependencies": { + "Lucene.Net": "4.8.0", + "Lucene.Net.Analysis.Common": "4.8.0" + }, + "frameworks": { + "netstandard1.5": { + "imports": "dnxcore50", + "buildOptions": { + "debugType": "portable", + "define": [ "NETSTANDARD" ] + }, + "dependencies": { + "NETStandard.Library": "1.6.0", + "System.Globalization.Extensions": "4.3.0", + "System.Text.Encoding.CodePages": "4.4.0-preview1-25305-02" + } + }, + "net451": { + "buildOptions": { + "debugType": "full", + "define": [ "FEATURE_SERIALIZABLE" ] + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt b/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt new file mode 100644 index 0000000..71b7508 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/stoptags.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. 
+# +##### +# noun: unclassified nouns +#åè© +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#åè©-ä¸è¬ +# +# noun-proper: Proper nouns where the sub-classification is undefined +#åè©-åºæåè© +# +# noun-proper-misc: miscellaneous proper nouns +#åè©-åºæåè©-ä¸è¬ +# +# noun-proper-person: Personal names where the sub-classification is undefined +#åè©-åºæåè©-人å +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. ãå¸ã®æ¹ +#åè©-åºæåè©-人å-ä¸è¬ +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. å±±ç° +#åè©-åºæåè©-人å-å§ +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太é +#åè©-åºæåè©-人å-å +# +# noun-proper-organization: Names representing organizations. +# e.g. éç£ç, NHK +#åè©-åºæåè©-çµç¹ +# +# noun-proper-place: Place names where the sub-classification is undefined +#åè©-åºæåè©-å°å +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. ã¢ã¸ã¢, ãã«ã»ãã, äº¬é½ +#åè©-åºæåè©-å°å-ä¸è¬ +# +# noun-proper-place-country: Country names. +# e.g. æ¥æ¬, ãªã¼ã¹ãã©ãªã¢ +#åè©-åºæåè©-å°å-å½ +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#åè©-代åè© +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. ãã, ãã, ããã¤, ããªã, ãã¡ãã¡, ããã¤, ã©ãã, ãªã«, ã¿ãªãã, ã¿ããª, ãããã, ãããã +#åè©-代åè©-ä¸è¬ +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ããã, ããã, ãããã, ããã, ãããã +#åè©-代åè©-ç¸®ç´ +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. éæ, 䏿, åå¾, å°é +#åè©-å¯è©å¯è½ +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (ãã, ã§ãã, ãªãã, ãã ãã) +# e.g. 
ã¤ã³ããã, æç, æªå, æªæ¦è¦é, ä¸å®å¿, ä¸åã +#åè©-ãµå¤æ¥ç¶ +# +# noun-adjective-base: The base form of adjectives, words that appear before 㪠("na") +# e.g. å¥åº·, 宿, é§ç®, ã ã +#åè©-形容åè©èªå¹¹ +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like ä½ (å), æ°. +# e.g. 0, 1, 2, ä½, æ°, å¹¾ +#åè©-æ° +# +# noun-affix: noun affixes where the sub-classification is undefined +#åè©-éèªç« +# +# noun-affix-misc: Of adnominalizers, the case-marker ã® ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. ããã¤ã, æ, ãã, ç²æ, æ°, ããã, å«ã, ãã, ç, ãã¨, äº, ãã¨, æ¯, ãã ã, 次第, +# é , ãã, æçº, ã¤ãã§, åºã§, ã¤ãã, ç©ãã, ç¹, ã©ãã, ã®, ã¯ã, ç, ã¯ãã¿, å¼¾ã¿, +# æå, ãµã, ãµã, æ¯ã, ã»ã, æ¹, æ¨, ãã®, ç©, è , ãã, æ , ããã, æä»¥, ãã, 訳, +# ãã, å²ã, å², ã-å£èª/, ãã-å£èª/ +#åè©-éèªç«-ä¸è¬ +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. ããã , é, ããã, æãå¥, ãã¨, å¾, ä½ã, 以å¤, 以é, 以å¾, 以ä¸, 以å, 䏿¹, ãã, +# ä¸, ãã¡, å , ãã, æã, ããã, éã, ãã, ã£ãã, çµæ, ãã, é , ãã, é, æä¸, ããªã, +# æä¸, ããã, èªä½, ãã³, 度, ãã, çº, ã¤ã©, é½åº¦, ã¨ãã, éã, ã¨ã, æ, ã¨ãã, æ, +# ã¨ãã, é端, ãªã, ä¸, ã®ã¡, å¾, ã°ãã, å ´å, æ¥, ã¶ã, å, ã»ã, ä», ã¾ã, å, ã¾ã¾, +# å, ä¾, ã¿ãã, ç¢å +#åè©-éèªç«-å¯è©å¯è½ +# +# noun-affix-aux: noun affixes treated as å©åè© ("auxiliary verb") in school grammars +# with the stem ãã(ã ) ("you(da)"). +# e.g. ãã, ãã, æ§ (ãã) +#åè©-éèªç«-å©åè©èªå¹¹ +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form 㪠(aux "da"). +# e.g. ã¿ãã, ãµã +#åè©-éèªç«-形容åè©èªå¹¹ +# +# noun-special: special nouns where the sub-classification is undefined. +#åè©-ç¹æ® +# +# noun-special-aux: The ããã ("souda") stem form that is used for reporting news, is +# treated as å©åè© ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. 
ãã +#åè©-ç¹æ®-å©åè©èªå¹¹ +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#åè©-æ¥å°¾ +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ã¬ã« or ã¿ã¤ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# æ¥å°¾èª ("suffix") and is usually the last element in a compound noun. +# e.g. ãã, ãã, æ¹, ç²æ (ãã), ããã, ãã¿, æ°å³, ããã¿, (ï½ãã) ã, 次第, æ¸ (ã) ã¿, +# ãã, (ã§ã)ã£ã, æ, 観, æ§, å¦, é¡, é¢, ç¨ +#åè©-æ¥å°¾-ä¸è¬ +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. å, æ§, è +#åè©-æ¥å°¾-人å +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. çº, å¸, ç +#åè©-æ¥å°¾-å°å +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before ã¹ã« ("suru"). +# e.g. å, è¦, åã, å ¥ã, è½ã¡, è²·ã +#åè©-æ¥å°¾-ãµå¤æ¥ç¶ +# +# noun-suffix-aux: The stem form of ããã (æ§æ ) that is used to indicate conditions, +# is treated as å©åè© ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. ãã +#åè©-æ¥å°¾-å©åè©èªå¹¹ +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula ã ("da"). +# e.g. ç, ã, ãã¡ +#åè©-æ¥å°¾-形容åè©èªå¹¹ +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. å¾ (ã), 以å¾, 以é, 以å, åå¾, ä¸, æ«, ä¸, æ (ã) +#åè©-æ¥å°¾-å¯è©å¯è½ +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 婿°è© ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 
å, ã¤, æ¬, å, ãã¼ã»ã³ã, cm, kg, ã«æ, ãå½, åºç», æé, æå +#åè©-æ¥å°¾-婿°è© +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽ã) ã, (èã) æ¹ +#åè©-æ¥å°¾-ç¹æ® +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (æ¥æ¬) 対 (ã¢ã¡ãªã«), 対 (ã¢ã¡ãªã«), (3) 対 (5), (女åª) å ¼ (主婦) +#åè©-æ¥ç¶è©ç +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle 㦠("te") and are +# semantically verb-like. +# e.g. ããã, ã覧, 御覧, é æ´ +#åè©-åè©éèªç«ç +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for åè© å¼ç¨æåå ("noun quotation") +# is ããã ("iwaku"). +#åè©-å¼ç¨æåå +# +# noun-nai_adjective: Words that appear before the auxiliary verb ãªã ("nai") and +# behave like an adjective. +# e.g. ç³ã訳, 仿¹, ã¨ãã§ã, éã +#åè©-ãã¤å½¢å®¹è©èªå¹¹ +# +##### +# prefix: unclassified prefixes +#æ¥é è© +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. ã (æ°´), æ (æ°), å (社), æ (ï½æ°), é« (å質), ã (è¦äº), ã (ç«æ´¾) +#æ¥é è©-åè©æ¥ç¶ +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by ãªã/ãªãã/ãã ãã. +# e.g. ã (èªã¿ãªãã), ã (座ã) +#æ¥é è©-åè©æ¥ç¶ +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. ã (å¯ãã§ããã), ãã« (ã§ãã) +#æ¥é è©-å½¢å®¹è©æ¥ç¶ +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 
ç´, ããã, æ¯æ +#æ¥é è©-æ°æ¥ç¶ +# +##### +# verb: unclassified verbs +#åè© +# +# verb-main: +#åè©-èªç« +# +# verb-auxiliary: +#åè©-éèªç« +# +# verb-suffix: +#åè©-æ¥å°¾ +# +##### +# adjective: unclassified adjectives +#å½¢å®¹è© +# +# adjective-main: +#形容è©-èªç« +# +# adjective-auxiliary: +#形容è©-éèªç« +# +# adjective-suffix: +#形容è©-æ¥å°¾ +# +##### +# adverb: unclassified adverbs +#å¯è© +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. ãããããã, å¤å +#å¯è©-ä¸è¬ +# +# adverb-particle_conjunction: Adverbs that can be followed by ã®, ã¯, ã«, +# ãª, ãã, ã , etc. +# e.g. ãããªã«, ãããªã«, ãããªã«, ãªã«ã, ãªãã§ã +#å¯è©-å©è©é¡æ¥ç¶ +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. ãã®, ãã®, ãã®, ã©ã®, ãããã, ãªãããã®, ä½ããã®, ããããª, ãããã, ãããã, ãããã, +# ã©ããã, ãããª, ãããª, ãããª, ã©ããª, 大ããª, å°ããª, ããããª, ã»ãã®, ãããã, +# ã(, ã) ãã (ãã¨ãªãã)ã, å¾®ã ãã, å ã ãã, åãªã, ãããªã, æãããåã, 亡ã +#é£ä½è© +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. ã, ããã©ã, ããã¦, ããã, ããã©ããã +æ¥ç¶è© +# +##### +# particle: unclassified particles. +å©è© +# +# particle-case: case particles where the subclassification is undefined. +å©è©-æ ¼å©è© +# +# particle-case-misc: Case particles. +# e.g. ãã, ã, ã§, ã¨, ã«, ã¸, ãã, ã, ã®, ã«ã¦ +å©è©-æ ¼å©è©-ä¸è¬ +# +# particle-case-quote: the "to" that appears after nouns, a personâs speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( ã ) 㨠(è¿°ã¹ã.), ( ã§ãã) 㨠(ãã¦å·è¡ç¶äº...) +å©è©-æ ¼å©è©-å¼ç¨ +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
ã¨ãã, ã¨ãã£ã, ã¨ããã, ã¨ãã¦, ã¨ã¨ãã«, ã¨å ±ã«, ã§ãã£ã¦, ã«ããã£ã¦, ã«å½ãã£ã¦, ã«å½ã£ã¦, +# ã«ããã, ã«å½ãã, ã«å½ã, ã«å½ãã, ã«ããã, ã«ããã¦, ã«æ¼ãã¦,ã«æ¼ã¦, ã«ããã, ã«æ¼ãã, +# ã«ãã, ã«ããã¦, ã«ããã, ã«é¢ã, ã«ãããã¦, ã«é¢ãã¦, ã«ãããã, ã«é¢ãã, ã«éã, +# ã«éãã¦, ã«ãããã, ã«å¾ã, ã«å¾ã, ã«ãããã£ã¦, ã«å¾ã£ã¦, ã«ããã, ã«å¯¾ã, ã«ãããã¦, +# ã«å¯¾ãã¦, ã«ãããã, ã«å¯¾ãã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¤ã, ã«ã¤ãã¦, ã«ã¨ã£ã¦, +# ã«ã¨ã, ã«ã¾ã¤ãã, ã«ãã£ã¦, ã«ä¾ã£ã¦, ã«å ã£ã¦, ã«ãã, ã«ä¾ã, ã«å ã, ã«ãã, ã«ä¾ã, ã«å ã, +# ã«ããã£ã¦, ã«ããã, ããã£ã¦, ã以ã£ã¦, ãéã, ãéãã¦, ãéãã¦, ãããã£ã¦, ãããã, ãããã, +# ã£ã¦-å£èª/, ã¡ã ã-é¢è¥¿å¼ãã¨ããã/, (ä½) ã¦ãã (人)-å£èª/, ã£ã¦ãã-å£èª/, ã¨ããµ, ã¨ãããµ +å©è©-æ ¼å©è©-é£èª +# +# particle-conjunctive: +# e.g. ãã, ããã«ã¯, ã, ããã©, ããã©ã, ãã©, ã, ã¤ã¤, ã¦, ã§, ã¨, ã¨ããã, ã©ããã, ã¨ã, ã©ã, +# ãªãã, ãªã, ã®ã§, ã®ã«, ã°, ãã®ã®, ã ( ãã), ãããªã, (ããã) ãã(ãããªã)-å£èª/, +# (è¡ã£) ã¡ã(ãããªã)-å£èª/, (è¨ã£) ãã£ã¦ (ãããããªã)-å£èª/, (ããããªã)ã£ãã£ã¦ (å¹³æ°)-å£èª/ +å©è©-æ¥ç¶å©è© +# +# particle-dependency: +# e.g. ãã, ãã, ãã, ãã, ã¯, ã, ã +å©è©-ä¿å©è© +# +# particle-adverbial: +# e.g. ãã¦ã, ãã, ããã, ä½, ããã, ãã, (妿 ¡) ãã(ãããæµè¡ã£ã¦ãã)-å£èª/, +# (ãã)ããã (ãããªã)-å£èª/, ãã¤, (ç§) ãªã, ãªã©, (ç§) ãªã (ã«), (å ç) ãªãã (大å«ã)-å£èª/, +# (ç§) ãªãã, (å ç) ãªã㦠(大å«ã)-å£èª/, ã®ã¿, ã ã, (ç§) ã ã£ã¦-å£èª/, ã ã«, +# (å½¼)ã£ãã-å£èª/, (ãè¶) ã§ã (ããã), ç (ã¨ã), (ä»å¾) ã¨ã, ã°ãã, ã°ã£ã-å£èª/, ã°ã£ãã-å£èª/, +# ã»ã©, ç¨, ã¾ã§, è¿, (誰) ã (ã)([å©è©-æ ¼å©è©] ããã³ [å©è©-ä¿å©è©] ã®åã«ä½ç½®ããããã) +å©è©-å¯å©è© +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (æ¾å³¶) ã +å©è©-éæå©è© +# +# particle-coordinate: +# e.g. ã¨, ãã, ã ã®, ã ã, ã¨ã, ãªã, ã, ãã +å©è©-並ç«å©è© +# +# particle-final: +# e.g. 
ãã, ããã, ã, ã, (ã )ã£ã-å£èª/, (ã¨ã¾ã£ã¦ã) ã§-æ¹è¨/, ãª, ã, ãªã-å£èª/, ã, ã, ã, +# ãã-å£èª/, ãã-å£èª/, ãã-æ¹è¨/, ã®, ã®ã-å£èª/, ã, ã, ã¨, ãã-å£èª/, ã, ãã-å£èª/ +å©è©-çµå©è© +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) ãA ã B ãã. Ex:ã(å½å ã§éç¨ãã) ã,(æµ·å¤ã§éç¨ãã) ã (.)ã +# (b) Inside an adverb phrase. Ex:ã(幸ãã¨ãã) ã (, æ»è ã¯ããªãã£ã.)ã +# ã(ç¥ããå±ãããã) ã (, 試é¨ã«åæ ¼ãã.)ã +# (c) ããã®ããã«ã. Ex:ã(ä½ããªãã£ã) ã (ã®ããã«æ¯ãèã£ã.)ã +# e.g. ã +å©è©-å¯å©è©ï¼ä¸¦ç«å©è©ï¼çµå©è© +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. +å©è©-é£ä½å +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. ã«, 㨠+å©è©-å¯è©å +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. ããª, ãã, ( ããã ãã) ã«, (ããã) ã«ã(ãããã), (俺) ã (å®¶) +å©è©-ç¹æ® +# +##### +# auxiliary-verb: +å©åè© +# +##### +# interjection: Greetings and other exclamations. +# e.g. ãã¯ãã, ãã¯ãããããã¾ã, ããã«ã¡ã¯, ããã°ãã¯, ãããã¨ã, ã©ãããããã¨ã, ãããã¨ããããã¾ã, +# ããã ãã¾ã, ãã¡ãããã¾, ãããªã, ããããªã, ã¯ã, ããã, ããã, ããããªãã +#æåè© +# +##### +# symbol: unclassified Symbols. +è¨å· +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [ââ@$ãâ+] +è¨å·-ä¸è¬ +# +# symbol-comma: Commas +# e.g. [,ã] +è¨å·-èªç¹ +# +# symbol-period: Periods and full stops. +# e.g. [.ï¼ã] +è¨å·-å¥ç¹ +# +# symbol-space: Full-width whitespace. +è¨å·-ç©ºç½ +# +# symbol-open_bracket: +# e.g. [({ââãã] +è¨å·-æ¬å¼§é +# +# symbol-close_bracket: +# e.g. [)}ââããã] +è¨å·-æ¬å¼§é +# +# symbol-alphabetic: +#è¨å·-ã¢ã«ãã¡ããã +# +##### +# other: unclassified other +#ãã®ä» +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. 
(ã )ã¡ +ãã®ä»-éæ +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. ãã®, ããã¨, ã㨠+ãã£ã©ã¼ +# +##### +# non-verbal: non-verbal sound. +éè¨èªé³ +# +##### +# fragment: +#èªæç +# +##### +# unknown: unknown part of speech. +#æªç¥èª +# +##### End of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt b/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt new file mode 100644 index 0000000..d4321be --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/stopwords.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. 
+# +ã® +ã« +㯠+ã +ã +ã +ã§ +㦠+㨠+ã +ã +ã +ãã +ãã +ã +ãã +ãã +㪠+ã㨠+ã¨ã㦠+ã +ã +ãã +ãªã© +ãªã£ +ãªã +ãã® +ãã +ãã® +ã㣠+ãã +ã¾ã +ãã® +ã¨ãã +ãã +ã¾ã§ +ãã +ãªã +㸠+ã +ã +ãã +ã«ãã£ã¦ +ã«ãã +ãã +ãã +ã«ãã +ã +ãªã +ããã +ã«ãã㦠+ã° +ãªã㣠+ãªã +ããã +ã«ã¤ã㦠+ã +ã 㣠+ãã®å¾ +ã§ãã +ãã +ã +ã®ã§ +ãªã +ã®ã¿ +ã§ã +ã +㤠+ã«ããã +ããã³ +ãã +ããã« +ã§ã +ã +ãã +ãã®ä» +ã«é¢ãã +ãã¡ +ã¾ã +ã +ãªã +ã«å¯¾ã㦠+ç¹ã« +ãã +åã³ +ããã +ã¨ã +ã§ã¯ +ã«ã¦ +ã»ã +ãªãã +ãã¡ +ãã㦠+ã¨ã¨ãã« +ãã ã +ãã¤ã¦ +ãããã +ã¾ã㯠+ã +ã»ã© +ãã®ã® +ã«å¯¾ãã +ã»ã¨ãã© +ã¨å ±ã« +ã¨ãã£ã +ã§ã +ã¨ã +ã¨ãã +ãã +##### End of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs new file mode 100644 index 0000000..dd305a4 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/TestTokenInfoDictionary.cs @@ -0,0 +1,114 @@ +using Lucene.Net.Analysis.Ja.Util; +using Lucene.Net.Util; +using Lucene.Net.Util.Fst; +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Ja.Dict +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Sanity checks over the complete contents of <see cref="TokenInfoDictionary"/>.
+    /// </summary>
+    public class TestTokenInfoDictionary : LuceneTestCase
+    {
+        /// <summary>
+        /// Enumerates every FST entry and every word id attached to it, checking that
+        /// ids are strictly increasing, that all strings are valid UTF-16, and that the
+        /// morphological tags are known to <see cref="ToStringUtil"/>.
+        /// </summary>
+        [Test]
+        public void TestEnumerateAll()
+        {
+            // The two tallies are only reported when VERBOSE is enabled.
+            int termCount = 0;
+            int wordCount = 0;
+            int previousWordId = -1;
+            int previousSourceId = -1;
+
+            TokenInfoDictionary dictionary = TokenInfoDictionary.GetInstance();
+            ConnectionCosts costs = ConnectionCosts.GetInstance();
+            FST<long?> fst = dictionary.FST.InternalFST;
+            Int32sRefFSTEnum<long?> entries = new Int32sRefFSTEnum<long?>(fst);
+            Int32sRef wordIds = new Int32sRef();
+
+            for (Int32sRefFSTEnum.InputOutput<long?> entry = entries.Next();
+                 entry != null;
+                 entry = entries.Next())
+            {
+                termCount++;
+
+                // Rebuild the surface form from the FST input labels.
+                Int32sRef labels = entry.Input;
+                char[] surface = new char[labels.Length];
+                for (int k = 0; k < surface.Length; k++)
+                {
+                    surface[k] = (char)labels.Int32s[labels.Offset + k];
+                }
+                assertTrue(UnicodeUtil.ValidUTF16String(new string(surface)));
+
+                // The walk is in order, so terms, source ids and word ids must always increase.
+                int sourceId = (int)entry.Output.Value;
+                assertTrue(sourceId > previousSourceId);
+                previousSourceId = sourceId;
+
+                dictionary.LookupWordIds(sourceId, wordIds);
+                for (int k = 0; k < wordIds.Length; k++)
+                {
+                    wordCount++;
+                    int wordId = wordIds.Int32s[wordIds.Offset + k];
+                    assertTrue(wordId > previousWordId);
+                    previousWordId = wordId;
+
+                    AssertNullOrValidUtf16(dictionary.GetBaseForm(wordId, surface, 0, surface.Length));
+
+                    string inflectionForm = dictionary.GetInflectionForm(wordId);
+                    AssertNullOrValidUtf16(inflectionForm);
+                    if (inflectionForm != null)
+                    {
+                        // must be a real ipadic inflection form
+                        assertNotNull(ToStringUtil.GetInflectedFormTranslation(inflectionForm));
+                    }
+
+                    string inflectionType = dictionary.GetInflectionType(wordId);
+                    AssertNullOrValidUtf16(inflectionType);
+                    if (inflectionType != null)
+                    {
+                        // must be a real ipadic inflection type
+                        assertNotNull(ToStringUtil.GetInflectionTypeTranslation(inflectionType));
+                    }
+
+                    // These lookups are only exercised; their results are not inspected.
+                    int leftId = dictionary.GetLeftId(wordId);
+                    int rightId = dictionary.GetRightId(wordId);
+                    costs.Get(rightId, leftId);
+                    dictionary.GetWordCost(wordId);
+
+                    string pos = dictionary.GetPartOfSpeech(wordId);
+                    AssertValidUtf16(pos);
+                    // must be a real ipadic pos tag
+                    assertNotNull(ToStringUtil.GetPOSTranslation(pos));
+
+                    AssertValidUtf16(dictionary.GetPronunciation(wordId, surface, 0, surface.Length));
+                    AssertValidUtf16(dictionary.GetReading(wordId, surface, 0, surface.Length));
+                }
+            }
+
+            if (VERBOSE)
+            {
+                Console.WriteLine("checked " + termCount + " terms, " + wordCount + " words.");
+            }
+        }
+
+        /// <summary>Asserts that <paramref name="text"/> is either null or a valid UTF-16 string.</summary>
+        private void AssertNullOrValidUtf16(string text)
+        {
+            assertTrue(text == null || UnicodeUtil.ValidUTF16String(text));
+        }
+
+        /// <summary>Asserts that <paramref name="text"/> is non-null and a valid UTF-16 string.</summary>
+        private void AssertValidUtf16(string text)
+        {
+            assertNotNull(text);
+            assertTrue(UnicodeUtil.ValidUTF16String(text));
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs new file mode 100644 index 0000000..f899476 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/Dict/UserDictionaryTest.cs @@ -0,0 +1,90 @@ +using Lucene.Net.Util; +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Ja.Dict +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+     */
+
+    /// <summary>
+    /// Tests <see cref="UserDictionary"/> lookups, readings and part-of-speech data against
+    /// the dictionary returned by <c>TestJapaneseTokenizer.ReadDict()</c>.
+    /// </summary>
+    /// <remarks>
+    /// As used by these tests, each row returned by <c>Lookup</c> holds the word id at
+    /// index 0, the start position of the match at index 1 and its length at index 2.
+    /// </remarks>
+    public class UserDictionaryTest : LuceneTestCase
+    {
+        /// <summary>
+        /// Checks the number, positions and lengths of the user-dictionary segments found in a sentence.
+        /// </summary>
+        [Test]
+        public void TestLookup()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            string s = "関西国際空港に行った";
+            int[][] dictionaryEntryResult = dictionary.Lookup(s.toCharArray(), 0, s.Length);
+            // Length should be three 関西, 国際, 空港
+            assertEquals(3, dictionaryEntryResult.Length);
+
+            // Test positions
+            assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
+            assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
+            assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
+
+            // Test lengths
+            assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
+            assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
+            assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
+
+            s = "関西国際空港と関西国際空港に行った";
+            int[][] dictionaryEntryResult2 = dictionary.Lookup(s.toCharArray(), 0, s.Length);
+            // Length should be six
+            assertEquals(6, dictionaryEntryResult2.Length);
+        }
+
+        /// <summary>
+        /// Checks the readings registered for user-dictionary entries.
+        /// </summary>
+        [Test]
+        public void TestReadings()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            // The explicit lengths (6, 2, 3) are the UTF-16 lengths of the literals passed alongside them.
+            int[][] result = dictionary.Lookup("日本経済新聞".toCharArray(), 0, 6);
+            assertEquals(3, result.Length);
+            int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
+            assertEquals("ニホン", dictionary.GetReading(wordIdNihon, "日本".toCharArray(), 0, 2));
+
+            result = dictionary.Lookup("朝青龍".toCharArray(), 0, 3);
+            assertEquals(1, result.Length);
+            int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
+            assertEquals("アサショウリュウ", dictionary.GetReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
+        }
+
+        /// <summary>
+        /// Checks the part-of-speech tag reported for a user-dictionary entry.
+        /// </summary>
+        [Test]
+        public void TestPartOfSpeech()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            int[][] result = dictionary.Lookup("日本経済新聞".toCharArray(), 0, 6);
+            assertEquals(3, result.Length);
+            int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
+            assertEquals("カスタム名詞", dictionary.GetPartOfSpeech(wordIdKeizai));
+        }
+
+        /// <summary>
+        /// Checks that the test user dictionary can be loaded at all.
+        /// </summary>
+        [Test]
+        public void TestRead()
+        {
+            UserDictionary dictionary = TestJapaneseTokenizer.ReadDict();
+            assertNotNull(dictionary);
+        }
+    }
+}
