Revision: 7871
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=7871&view=rev
Author:   taolin2011
Date:     2012-08-17 10:36:09 +0000 (Fri, 17 Aug 2012)
Log Message:
-----------
unit tests for Chinese support

Added Paths:
-----------
    trunk/JLanguageTool/src/test/org/languagetool/tagging/zh/
    trunk/JLanguageTool/src/test/org/languagetool/tagging/zh/ChineseTaggerTest.java
    trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/
    trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseSentenceTokenizerTest.java
    trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseWordTokenizerTest.java
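
For orientation (this note and the sketch below are editorial, not part of the commit): ChineseWordTokenizer segments raw Chinese text into "word/tag" strings, ChineseSentenceTokenizer splits text into sentences, and ChineseTagger supplies the part-of-speech readings that ChineseTaggerTest checks through TestTools.myAssert. A minimal, hypothetical demo of the word tokenizer, mirroring the first assertion in ChineseWordTokenizerTest (the demo class name is made up; it assumes the LanguageTool classes below are on the classpath):

    import java.util.List;

    import org.languagetool.tokenizers.zh.ChineseWordTokenizer;

    public class ChineseWordTokenizerDemo {  // hypothetical demo class, not in the commit
        public static void main(String[] args) {
            ChineseWordTokenizer tokenizer = new ChineseWordTokenizer();
            // Per ChineseWordTokenizerTest below, this should yield 7 tokens:
            // [主任/n, 强调/vd, 指出/v, 错误/a, 的/u, 地方/n, 。/w]
            List<String> tokens = tokenizer.tokenize("主任强调指出错误的地方。");
            System.out.println(tokens.size() + " tokens: " + tokens);
        }
    }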

Added: trunk/JLanguageTool/src/test/org/languagetool/tagging/zh/ChineseTaggerTest.java
===================================================================
--- trunk/JLanguageTool/src/test/org/languagetool/tagging/zh/ChineseTaggerTest.java  (rev 0)
+++ trunk/JLanguageTool/src/test/org/languagetool/tagging/zh/ChineseTaggerTest.java  2012-08-17 10:36:09 UTC (rev 7871)
@@ -0,0 +1,96 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+package org.languagetool.tagging.zh;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+import org.languagetool.TestTools;
+import org.languagetool.tokenizers.zh.ChineseWordTokenizer;
+
+/**
+ * Tests for ChineseTagger.
+ * 
+ * @author Minshan Chen
+ * @author Xiaohui Wu
+ * @author Jiamin Zheng
+ */
+public class ChineseTaggerTest extends TestCase {
+
+       private ChineseTagger tagger;
+       private ChineseWordTokenizer tokenizer;
+
+       public void setUp() {
+               tagger = new ChineseTagger();
+               tokenizer = new ChineseWordTokenizer();
+       }
+
+       public void testTagger() throws IOException {
+
+               TestTools
+                               .myAssert(
+                                               "主任强调指出错误的地方。",
+                                               "主任/[null]n -- 强调/[null]vd -- 指出/[null]v -- 错误/[null]a -- 的/[null]u -- 地方/[null]n -- 。/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "她胸前挂着一块碧绿的玉。",
+                                               "她/[null]r -- 胸前/[null]s -- 挂/[null]v -- 着/[null]u -- 一/[null]m -- 块/[null]q -- 碧绿/[null]z -- 的/[null]u -- 玉/[null]n -- 。/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "“鲯鳅”的研究结果有什么奥妙?",
+                                               "“/[null]w -- 鲯/[null]x -- 鳅/[null]x -- ”/[null]w -- 的/[null]u -- 研究/[null]vn -- 结果/[null]n -- 有/[null]v -- 什么/[null]r -- 奥妙/[null]an -- ?/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "我们的女组长真是尺竿头更进一步。",
+                                               "我们/[null]r -- 的/[null]u -- 女/[null]b -- 组长/[null]n -- 真/[null]d -- 是/[null]v -- 尺/[null]ng -- 竿/[null]ng -- 头/[null]n -- 更进一步/[null]l -- 。/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "国务院,非国家工作人员不能随便进去的地方。",
+                                               "国务院/[null]nt -- ,/[null]w -- 非/[null]h -- 国家/[null]n -- 工作/[null]vn -- 人员/[null]n -- 不能/[null]v -- 随便/[null]ad -- 进去/[null]v -- 的/[null]u -- 地方/[null]n -- 。/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "“哇……”珠海北师大操场上的师生大吃一惊!",
+                                               "“/[null]w -- 哇/[null]y -- …/[null]w -- …/[null]w -- ”/[null]w -- 珠海/[null]ns -- 北师大/[null]j -- 操场/[null]n -- 上/[null]f -- 的/[null]u -- 师生/[null]n -- 大吃一惊/[null]i -- !/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "在炎热的暑假里,我和其他同学们参加了姜老师的一个项目。",
+                                               "在/[null]p -- 炎热/[null]a -- 的/[null]u -- 暑假/[null]t -- 里/[null]f -- ,/[null]w -- 我/[null]r -- 和/[null]c -- 其他/[null]r -- 同学/[null]n -- 们/[null]k -- 参加/[null]v -- 了/[null]u -- 姜/[null]n -- 老师/[null]n -- 的/[null]u -- 一个/[null]m -- 项目/[null]n -- 。/[null]w",
+                                               tokenizer, tagger);
+
+               TestTools
+                               .myAssert(
+                                               "“咕咚,”一台联想ThinkPad T系列电脑从关羽的宿舍飞了下来。",
+                                               "“/[null]w -- 咕咚/[null]o -- ,/[null]w -- ”/[null]w -- 一/[null]m -- 台/[null]q -- 联想/[null]nz -- ThinkPad/[null]nx -- T/[null]nx -- 系列/[null]q -- 电脑/[null]n -- 从/[null]p -- 关羽/[null]nr -- 的/[null]u -- 宿舍/[null]n -- 飞/[null]v -- 了/[null]u -- 下来/[null]v -- 。/[null]w",
+                                               tokenizer, tagger);
+
+       }
+}

Added: trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseSentenceTokenizerTest.java
===================================================================
--- trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseSentenceTokenizerTest.java  (rev 0)
+++ trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseSentenceTokenizerTest.java  2012-08-17 10:36:09 UTC (rev 7871)
@@ -0,0 +1,76 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+package org.languagetool.tokenizers.zh;
+
+import junit.framework.TestCase;
+
+import org.languagetool.TestTools;
+import org.languagetool.tokenizers.SentenceTokenizer;
+
+/**
+ * Tests for ChineseSentenceTokenizer.
+ * 
+ * @author Minshan Chen
+ * @author Xiaohui Wu
+ * @author Jiamin Zheng
+ */
+public class ChineseSentenceTokenizerTest extends TestCase {
+
+       private final SentenceTokenizer stokenizer = new ChineseSentenceTokenizer();
+
+       public void testTokenize() {
+
+               String t1 = "他说:";
+               String t2 = "我们是中国人";
+               String t3 = "中国人很好";
+
+               char[] symbol1 = { '_', '/', ';', ':', '!', '@', '#', '$', '%', '^',
+                               '&', '.', '+', '*', '?' };
+               for (char i : symbol1) {
+                       testSplit(t2 + i + t3); // e.g. 我们是中国人_中国人很好
+               }
+
+               char[] symbol2 = { ',', ':', '…', '!', '?', '、', ';', '。' };
+               for (char i : symbol2) {
+                       testSplit(t2 + i, t3); // e.g. 我们是中国人,/中国人很好
+               }
+
+               String[] symbol3 = { "\"", "\'", "‘", "(", "(", "“", "”", ")", ")",
+                               "’", "\'", "\"" };
+               for (int i = 0; i < symbol3.length / 2; i++) {
+
+                       testSplit(t1, symbol3[i], t2 + ",", t3
+                                       + symbol3[symbol3.length - 1 - i]); // e.g. 他说:/"/我们是中国人,/中国人很好"
+               }
+
+               String[] symbol4 = { "〝", "『", "«", "「", "〖", "{", "【", "[", "<", "《",
+                               "》", ">", "]", "】", "}", "〗", "」", "»", "』", "〞" };
+               for (int i = 0; i < symbol4.length / 2; i++) {
+                       testSplit(t1, symbol4[i] + t2 + ",", t3
+                                       + symbol4[symbol4.length - 1 - i]); // 他说:/〝我们是中国人,/中国人很好〞
+               }
+
+       }
+
+       private void testSplit(final String... sentences) {
+               TestTools.testSplit(sentences, stokenizer);
+       }
+
+}
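
A note on the split convention exercised above (inferred from how TestTools.testSplit is called in this file, not from separate documentation): passing a single string asserts that the sentence tokenizer keeps it as one sentence, while passing several strings asserts a split at exactly those boundaries. A standalone, hypothetical equivalent of one iteration of the symbol2 loop would look roughly like this (editorial sketch, not part of the commit):

    import org.languagetool.TestTools;
    import org.languagetool.tokenizers.SentenceTokenizer;
    import org.languagetool.tokenizers.zh.ChineseSentenceTokenizer;

    public class ChineseSentenceSplitDemo {  // hypothetical demo class, not in the commit
        public static void main(String[] args) {
            SentenceTokenizer tokenizer = new ChineseSentenceTokenizer();
            // Mirrors the ',' case of the symbol2 loop: "我们是中国人,中国人很好"
            // is expected to come back as two sentences, split after the comma.
            TestTools.testSplit(new String[] { "我们是中国人,", "中国人很好" }, tokenizer);
        }
    }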

Added: trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseWordTokenizerTest.java
===================================================================
--- trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseWordTokenizerTest.java  (rev 0)
+++ trunk/JLanguageTool/src/test/org/languagetool/tokenizers/zh/ChineseWordTokenizerTest.java  2012-08-17 10:36:09 UTC (rev 7871)
@@ -0,0 +1,88 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+package org.languagetool.tokenizers.zh;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for ChineseWordTokenizer.
+ * 
+ * @author Minshan Chen
+ * @author Xiaohui Wu
+ * @author Jiamin Zheng
+ */
+public class ChineseWordTokenizerTest extends TestCase {
+
+       public void testTokenize() {
+
+               ChineseWordTokenizer wordTokenizer = new ChineseWordTokenizer();
+
+               List<String> tokens = wordTokenizer.tokenize("主任强调指出错误的地方。");
+               assertEquals(tokens.size(), 7);
+               assertEquals("[主任/n, 强调/vd, 指出/v, 错误/a, 的/u, 地方/n, 。/w]",
+                               tokens.toString());
+
+               List<String> tokens2 = wordTokenizer.tokenize("她胸前挂着一块碧绿的玉。");
+               assertEquals(tokens2.size(), 10);
+               assertEquals("[她/r, 胸前/s, 挂/v, 着/u, 一/m, 块/q, 碧绿/z, 的/u, 玉/n, 。/w]",
+                               tokens2.toString());
+
+               List<String> tokens3 = wordTokenizer.tokenize("“鲯鳅”的研究结果有什么奥妙?");
+               assertEquals(tokens3.size(), 11);
+               assertEquals(
+                               "[“/w, 鲯/x, 鳅/x, ”/w, 的/u, 研究/vn, 结果/n, 有/v, 什么/r, 奥妙/an, ?/w]",
+                               tokens3.toString());
+
+               List<String> tokens4 = wordTokenizer.tokenize("我们的女组长真是尺竿头更进一步。");
+               assertEquals(tokens4.size(), 11);
+               assertEquals(
+                               "[我们/r, 的/u, 女/b, 组长/n, 真/d, 是/v, 尺/ng, 竿/ng, 头/n, 更进一步/l, 。/w]",
+                               tokens4.toString());
+
+               List<String> tokens5 = wordTokenizer.tokenize("国务院,非国家工作人员不能随便进去的地方。");
+               assertEquals(tokens5.size(), 12);
+               assertEquals(
+                               "[国务院/nt, ,/w, 非/h, 国家/n, 工作/vn, 人员/n, 不能/v, 随便/ad, 进去/v, 的/u, 地方/n, 。/w]",
+                               tokens5.toString());
+
+               List<String> tokens6 = wordTokenizer.tokenize("“哇……”珠海北师大操场上的师生大吃一惊!");
+               assertEquals(tokens6.size(), 13);
+               assertEquals(
+                               "[“/w, 哇/y, …/w, …/w, ”/w, 珠海/ns, 北师大/j, 操场/n, 上/f, 的/u, 师生/n, 大吃一惊/i, !/w]",
+                               tokens6.toString());
+
+               List<String> tokens7 = wordTokenizer
+                               .tokenize("在炎热的暑假里,我和其他同学们参加了姜老师的一个项目。");
+               assertEquals(tokens7.size(), 19);
+               assertEquals(
+                               "[在/p, 炎热/a, 的/u, 暑假/t, 里/f, ,/w, 我/r, 和/c, 其他/r, 同学/n, 们/k, 参加/v, 了/u, 姜/n, 老师/n, 的/u, 一个/m, 项目/n, 。/w]",
+                               tokens7.toString());
+
+               List<String> tokens8 = wordTokenizer
+                               .tokenize("“咕咚,”一台联想ThinkPad T系列电脑从关羽的宿舍飞了下来。");
+               assertEquals(tokens8.size(), 20);
+               assertEquals(
+                               "[“/w, 咕咚/o, ,/w, ”/w, 一/m, 台/q, 联想/nz, ThinkPad/nx, , T/nx, 系列/q, 电脑/n, 从/p, 关羽/nr, 的/u, 宿舍/n, 飞/v, 了/u, 下来/v, 。/w]",
+                               tokens8.toString());
+       }
+
+}

