Repository: lucenenet Updated Branches: refs/heads/master e67244aa2 -> 2d5108ba0
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/project.json b/src/Lucene.Net.Tests.Analysis.Kuromoji/project.json new file mode 100644 index 0000000..5badefa --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/project.json @@ -0,0 +1,43 @@ +{ + "version": "4.8.0", + "title": "Lucene.Net.Tests.Analysis.Kuromoji", + "buildOptions": { + "compile": { + "includeFiles": [ "../CommonAssemblyInfo.cs" ] + }, + "embed": { + "includeFiles": [ + "bocchan.utf-8", + "search-segmentation-tests.txt", + "userdict.txt" + ] + } + }, + "dependencies": { + "dotnet-test-nunit-teamcity": "3.4.0-beta-3", + "Lucene.Net.Analysis.Kuromoji": "4.8.0", + "Lucene.Net.TestFramework": "4.8.0", + "NUnit": "3.5.0" + }, + "testRunner": "nunit-teamcity", + "frameworks": { + "netcoreapp1.0": { + "imports": "dnxcore50", + "buildOptions": { + "debugType": "portable", + "define": [ "NETSTANDARD" ] + } + }, + "net451": { + "buildOptions": { + "debugType": "full", + "define": [ "FEATURE_SERIALIZABLE" ] + } + } + }, + + "runtimes": { + "win7-x86": {}, + "win7-x64": {} + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/search-segmentation-tests.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/search-segmentation-tests.txt b/src/Lucene.Net.Tests.Analysis.Kuromoji/search-segmentation-tests.txt new file mode 100644 index 0000000..835446f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/search-segmentation-tests.txt @@ -0,0 +1,142 @@ +### +### Tests for Kuromoji's search mode heuristic +### +### In search-mode, Kuromoji uses a heuristic to do extra splitting of words +### to get a decompounding effect useful for search. 
This file includes tests +### for this heuristic and demonstrates its usefulness, but also weaknesses. +### +### This file's format is as follows: +### <text><tab><token1> <token2> ... <token> +### +### This file should use UTF-8 encoding and there is one test per line. The +### text to be segmented and its expected surface form token sequence are +### separated by a tab ('\t'). Tokens are separated by a half-width space. +### Whitespace lines and lines starting with a '#' are ignored. Comments +### are not allowed on entry lines. +### +### NOTE: These tests depend on IPADIC +### +### Revision history: +### - 2012-01-29: Initial version +### + +## +## Organizations +## + +# Kansai International Airport +関西国際空港 関西 関西国際空港/0 国際 空港 +# Narita Airport +成田空港 成田 成田空港/0 空港 +# Haneda Airport +羽田空港 羽田 羽田空港/0 空港 +# Nara Institute of Science and Technology +奈良先端科学技術大学院大学 奈良 奈良先端科学技術大学院大学/0 先端 科学 技術 大学院 大学 +# Tokyo University +東京大学 東京 東京大学/0 大学 +# Kyoto University +京都大学 京都 京都大学/0 大学 + +# NOTE: differs from non-compound mode: +# Kyoto University Baseball Club +京都大学硬式野球部 京都大 学 硬式 野球 部 + +## +## Katakana titles +## + +# Senior Software Engineer +シニアソフトウェアエンジニア シニア シニアソフトウェアエンジニア/0 ソフトウェア エンジニア +# Software Engineer +ソフトウェアエンジニア ソフトウェア エンジニア +# Senior Project Manager +シニアプロジェクトマネジャー シニア シニアプロジェクトマネジャー/0 プロジェクト マネジャー +# Project Manager +プロジェクトマネジャー プロジェクト マネジャー +# Senior Sales Engineer +シニアセールスエンジニア シニア シニアセールスエンジニア/0 セールス エンジニア +# System Architect +システムアーキテクト システム システムアーキテクト/0 アーキテクト +# Senior System Architect +シニアシステムアーキテクト シニア シニアシステムアーキテクト/0 システム アーキテクト +# System Administrator +システムアドミニストレータ システム アドミニストレータ +システムアドミニストレーター システム システムアドミニストレーター/0 アドミニストレーター +# Senior System Administrator 
+シニアシステムアドミニストレーター シニア シニアシステムアドミニストレーター/0 システム アドミニストレーター + +## +## Company names (several are fictitious) +## + +# SoftBank Mobile +ソフトバンクモバイル ソフトバンク モバイル +# Alpine Materials +アルパインマテリアルズ アルパイン アルパインマテリアルズ/0 マテリアルズ +# Sapporo Holdings +サッポロホールディングス サッポロ ホールディングス +# Yamada Corporation +ヤマダコーポレーション ヤマダ ヤマダコーポレーション/0 コーポレーション +# Canon Semiconductor equipment NOTE: Semiconductor becomes semi + conductor +キヤノンセミコンダクターエクィップメント キヤノン キヤノンセミコンダクターエクィップメント/0 セミ コンダクター エクィップメント +# Oriental Chain +オリエンタルチエン オリエンタル オリエンタルチエン/0 チエン +# Ally Projects Japan NOTE: Becomes one token as プロジェクト is not in IPADIC +アーリープロジェクトジャパン アーリープロジェクトジャパン +# Peter Pan Corporation +ピーターパンコーポレーション ピーター ピーターパンコーポレーション/0 パン コーポレーション +# AIM Create +エイムクリエイツ エイム クリエイツ +# Mars Engineering +マースエンジニアリング マース マースエンジニアリング/0 エンジニアリング +# Fuji Protein Technology +フジプロテインテクノロジー フジ フジプロテインテクノロジー/0 プロテイン テクノロジー + +## +## Person names +## + +# Michael Jackson +マイケルジャクソン マイケル ジャクソン +# Steve Jobs +スティーブジョブズ スティーブ ジョブズ +# Harry Potter NOTE: Becomes one token (short word) +ハリーポッター ハリーポッター +# Bill Gates NOTE: Becomes one token (short word) +ビルゲイツ ビルゲイツ +# Sean Connery NOTE: Becomes one token (okay) +ショーンコネリー ショーンコネリー + +## +## Other nouns +## + +# Holdings +ホールディングス ホールディングス +# Engineering +エンジニアリング エンジニアリング +# Software Engineering +ソフトウェアエンジニアリング ソフトウェア エンジニアリング +# Shopping center +ショッピングセンター ショッピング センター +# Game center (arcade) NOTE: One token because of short word +ゲームセンター ゲームセンター +# Christmas shopping +クリスマスショッピング クリスマス ショッピング +# Download file +ダウンロードファイル ダウンロード ファイル +# Technology 
+テクノロジー テクノロジー +# Lillehammer Olympics +リレハンメルオリンピック リレハンメル オリンピック + +## +## Problematic terms +## + +# JT Engineering NOTE: Becomes J Tien ginia ring (substrings are in IPADIC) +ジェイティエンジニアリング ジェイ ジェイティエンジニアリング/0 ティエン ジニア リング +# Anchovy pasta NOTE: Becomes Anch yvipasta +アンチョビパスタ アンチ アンチョビパスタ/0 ョビパスタ +# Surprise gift NOTE: Becomes one token (surprise not in IPADIC) +サプライズギフト サプライズギフト http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/userdict.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/userdict.txt b/src/Lucene.Net.Tests.Analysis.Kuromoji/userdict.txt new file mode 100644 index 0000000..f9db02c --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/userdict.txt @@ -0,0 +1,10 @@ +# Custom segmentation for long entries +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞 + +# Custom reading for sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 + +# Silly entry: +abcd,a b cd,foo1 foo2 foo3,bar +abcdefg,ab cd efg,foo1 foo2 foo4,bar http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net/Support/Collections.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net/Support/Collections.cs b/src/Lucene.Net/Support/Collections.cs index dcafc25..3ded8e3 100644 --- a/src/Lucene.Net/Support/Collections.cs +++ b/src/Lucene.Net/Support/Collections.cs @@ -54,6 +54,15 @@ namespace Lucene.Net.Support return new SetFromMap<T>(map); } + public static void Reverse<T>(IList<T> list) + { + int size = list.Count; + for (int i = 0, mid = size >> 1, j = size - 1; i < mid; i++, j--) + { + Swap(list, i, j); + } + } + public static IComparer<T> ReverseOrder<T>() { return (IComparer<T>)ReverseComparer<T>.REVERSE_ORDER;
