Revision: 10030
http://sourceforge.net/p/languagetool/code/10030
Author: arysin
Date: 2013-05-01 03:06:44 +0000 (Wed, 01 May 2013)
Log Message:
-----------
Ukrainian sentence tokenizer update with tests
Ukrainian grammar rules update
Modified Paths:
--------------
trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx
trunk/languagetool/languagetool-language-modules/uk/src/main/java/org/languagetool/tokenizers/uk/UkrainianWordTokenizer.java
trunk/languagetool/languagetool-language-modules/uk/src/main/resources/org/languagetool/rules/uk/grammar.xml
Added Paths:
-----------
trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/
trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/uk/
trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/uk/UkrainianSRXSentenceTokenizerTest.java
Modified: trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx
===================================================================
--- trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx 2013-04-30 23:38:52 UTC (rev 10029)
+++ trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx 2013-05-01 03:06:44 UTC (rev 10030)
@@ -4742,29 +4742,23 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Ukrainian">
<rule break="no">
<beforebreak>\b\d+\.\s</beforebreak>
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
</rule>
-<!-- capital char abbreviations А. Б. В. -->
+<!-- TODO: Name.Surname - does not work for some reason -->
<rule break="no">
-<beforebreak>\b[А-Я]\.\s</beforebreak>
+<beforebreak>\b[А-ЯІЇЄҐ]\.[А-ЯІЇЄҐ]</beforebreak>
<afterbreak></afterbreak>
</rule>
+<!-- capital char abbreviations А. Б. В. -->
<rule break="no">
-<beforebreak>\bҐ\.\s</beforebreak>
+<beforebreak>\s[А-ЯІЇЄҐ]\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\bЇ\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bЄ\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
<beforebreak>\b[A-Z]\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
@@ -4774,10 +4768,14 @@
</rule>
<!-- date/time -->
<rule break="no">
-<beforebreak>\b[0-9]+(р)\.\s</beforebreak>
+<beforebreak>\b([0-9]{2}|[0-9]{4})(р)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
+<beforebreak>\b([0-9]0|[0-9]{3}0)(-х)?(рр)\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+<rule break="no">
<beforebreak>\b[XVILMC]+(ст)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
@@ -4791,59 +4789,19 @@
</rule>
<!--Measures -->
<rule break="no">
-<beforebreak>\b[0-9]+(г|гг|грн|млн|млрд|руб|тис)\.\s</beforebreak>
+<beforebreak>\b[0-9]+(г|гг|грн|млн|млрд|тис)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- other abbreviations -->
<rule break="no">
-<beforebreak>\bв\.\s</beforebreak>
+<beforebreak>\b(вул|р|рр|грн|млн|млрд|ст|тис)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\bвв\.\s</beforebreak>
+<beforebreak>\b(буд|кв)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<rule break="no">
-<beforebreak>\bвул\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bг\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bгг\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bгрн\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bмлн\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bмлрд\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bруб\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bст\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bр\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
-<beforebreak>\bтис\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="no">
<beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
<afterbreak></afterbreak>
</rule>
@@ -4881,6 +4839,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Belarusian">
<rule break="no">
<beforebreak>\b\d+\.\s</beforebreak>
Modified: trunk/languagetool/languagetool-language-modules/uk/src/main/java/org/languagetool/tokenizers/uk/UkrainianWordTokenizer.java
===================================================================
--- trunk/languagetool/languagetool-language-modules/uk/src/main/java/org/languagetool/tokenizers/uk/UkrainianWordTokenizer.java 2013-04-30 23:38:52 UTC (rev 10029)
+++ trunk/languagetool/languagetool-language-modules/uk/src/main/java/org/languagetool/tokenizers/uk/UkrainianWordTokenizer.java 2013-05-01 03:06:44 UTC (rev 10030)
@@ -44,14 +44,14 @@
@Override
public List<String> tokenize(final String text) {
- final List<String> l = new ArrayList<String>();
+ final List<String> tokenList = new ArrayList<String>();
final StringTokenizer st = new StringTokenizer(text, SPLIT_CHARS, true);
while (st.hasMoreElements()) {
- l.add( clean(st.nextToken()) );
+ tokenList.add( clean(st.nextToken()) );
}
- return l;
+ return tokenList;
}
private static String clean(String token) {
Modified: trunk/languagetool/languagetool-language-modules/uk/src/main/resources/org/languagetool/rules/uk/grammar.xml
===================================================================
--- trunk/languagetool/languagetool-language-modules/uk/src/main/resources/org/languagetool/rules/uk/grammar.xml 2013-04-30 23:38:52 UTC (rev 10029)
+++ trunk/languagetool/languagetool-language-modules/uk/src/main/resources/org/languagetool/rules/uk/grammar.xml 2013-05-01 03:06:44 UTC (rev 10030)
@@ -116,13 +116,22 @@
<rule>
<pattern case_sensitive="yes">
<token negate="yes" regexp="yes">[\p{Punct}–—\(«АаІі]<exception postag="SENT_START"></exception></token>
- <token regexp="yes">мабуть|врешті|по-перше|по-друге|по-третє|щоправда|о?крім того|втім</token>
+ <token regexp="yes">мабуть|врешті|щоправда|о?крім того|втім</token>
</pattern>
<message>Відсутня ліва кома: <suggestion>\1, \2</suggestion>.</message>
- <example type="correct"><marker>Мабуть,</marker> це водій.</example>
- <example type="correct">Це<marker>, мабуть</marker>, водій.</example>
+ <example type="correct">Мабуть, це водій.</example>
+ <example type="correct">Це, мабуть, водій.</example>
<example type="incorrect"><marker>Це мабуть</marker>, водій.</example>
</rule>
+ <rule>
+ <pattern case_sensitive="yes">
+ <token negate="yes" regexp="yes">[\p{Punct}–—\(«АаІі]|Це<exception postag="SENT_START"></exception></token>
+ <token regexp="yes">по-перше|по-друге|по-третє</token>
+ </pattern>
+ <message>Відсутня ліва кома: <suggestion>\1, \2</suggestion>.</message>
+ <example type="correct">Це по-перше.</example>
+ <example type="incorrect">По-перше, <marker>нашпигувати по-друге</marker>, запекти.</example>
+ </rule>
</rulegroup>
<rulegroup name="Відсутня кома перед «але», «однак», ..." id="COMMA_BEFORE_BUT">
Added: trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/uk/UkrainianSRXSentenceTokenizerTest.java
===================================================================
--- trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/uk/UkrainianSRXSentenceTokenizerTest.java (rev 0)
+++ trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/uk/UkrainianSRXSentenceTokenizerTest.java 2013-05-01 03:06:44 UTC (rev 10030)
@@ -0,0 +1,52 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package org.languagetool.tokenizers.uk;
+
+import junit.framework.TestCase;
+import org.languagetool.TestTools;
+import org.languagetool.language.Ukrainian;
+import org.languagetool.tokenizers.SRXSentenceTokenizer;
+
+/*
+ * Ukrainian SRX Sentence Tokenizer Test
+ */
+public class UkrainianSRXSentenceTokenizerTest extends TestCase {
+
+ private final SRXSentenceTokenizer stokenizer = new SRXSentenceTokenizer(new Ukrainian());
+
+ public final void testTokenize() {
+ testSplit("Це просте речення.");
+ testSplit("Вони приїхали в Париж. ", "Але там їм геть не сподобалося.");
+ testSplit("Панк-рок — напрям у рок-музиці, що виник у середині 1970-х рр. у США і Великобританії.");
+ testSplit("Разом із втечами, вже у XV ст. почастішали збройні виступи селян.");
+ testSplit("На початок 1994 р. державний борг України становив 4,8 млрд. дол.");
+// TODO:
+// testSplit("На початок 1994 р. державний борг України становив 4,8 млрд. ", "Досить значна сума.");
+ testSplit("Київ, вул. Сагайдачного, буд. 43, кв. 4.");
+ testSplit("Наша зустріч з А. Марчуком відбулася в грудні минулого року.");
+// TODO:
+// testSplit("Наша зустріч з А.Марчуком відбулася в грудні минулого року.");
+ }
+
+ private void testSplit(final String... sentences) {
+ TestTools.testSplit(sentences, stokenizer);
+ }
+
+}
Property changes on: trunk/languagetool/languagetool-language-modules/uk/src/test/java/org/languagetool/tokenizers/uk/UkrainianSRXSentenceTokenizerTest.java
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Introducing AppDynamics Lite, a free troubleshooting tool for Java/.NET
Get 100% visibility into your production application - at no cost.
Code-level diagnostics for performance bottlenecks with <2% overhead
Download for free and get started troubleshooting in minutes.
http://p.sf.net/sfu/appdyn_d2d_ap1
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits