Revision: 8524
http://languagetool.svn.sourceforge.net/languagetool/?rev=8524&view=rev
Author: dnaber
Date: 2012-12-09 18:21:57 +0000 (Sun, 09 Dec 2012)
Log Message:
-----------
[de] sentence splitter fix: a sentence boundary was missed after "Das ist ja 1a. Und..."
Modified Paths:
--------------
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/de/GermanSRXSentenceTokenizerTest.java
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
===================================================================
--- trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx	2012-12-09 18:10:53 UTC (rev 8523)
+++ trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx	2012-12-09 18:21:57 UTC (rev 8524)
@@ -25,6 +25,7 @@
</header>
<body>
<languagerules>
+
<languagerule languagerulename="Greek">
<!--κ.λπ. - και λοιπά-->
<rule break="no">
@@ -54,6 +55,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Polish">
<rule break="no">
<beforebreak>\badw\.\s</beforebreak>
@@ -1039,6 +1041,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="English">
<rule break="no">
<beforebreak>\b[nN]o\.\s</beforebreak>
@@ -1214,6 +1217,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Romanian">
<rule break="no">
<beforebreak>\b\d+\.\s</beforebreak>
@@ -1308,6 +1312,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Dutch">
<rule break="no">
<beforebreak>\b(Afr|Am|Ar|Br|Cie|Comp|Dhr|Dr|Em|Fa|Kon)\.\s</beforebreak>
@@ -1459,6 +1464,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Slovak">
<rule break="no">
<beforebreak>\b(Bc|Mgr|RNDr|PharmDr|PhDr|JUDr|PaedDr|ThDr|Ing|MUDr|MDDr|MVDr|Dr|ThLic|PhD|ArtD|ThDr|Dr|DrSc|CSs|prof)\.\s</beforebreak>
@@ -3133,6 +3139,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Icelandic">
<!-- Numbers -->
<rule break="no">
@@ -3958,6 +3965,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Russian">
<rule break="no">
<beforebreak>\b\d+\.\s</beforebreak>
@@ -4057,6 +4065,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Default">
<rule break="yes">
<beforebreak>\u2029</beforebreak>
@@ -4073,18 +4082,21 @@
<afterbreak></afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="ByLineBreak">
<rule break="yes">
<beforebreak>\r?\n</beforebreak>
<afterbreak></afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="ByTwoLineBreaks">
<rule break="yes">
<beforebreak>\r?\n\s*\r?\n[\t]*</beforebreak>
<afterbreak></afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Slovenian">
<rule break="no">
<beforebreak>\b[dD]r\.\s</beforebreak>
@@ -4239,6 +4251,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Catalan">
<!-- Abbreviations that cannot finish sentences-->
<rule break="no">
@@ -4307,6 +4320,7 @@
<afterbreak>[¡¿«»"'\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Spanish">
<!-- Abbreviations that cannot finish sentences-->
<rule break="no">
@@ -4375,6 +4389,7 @@
<afterbreak>[¡¿«»"'\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="German">
<!-- Split e.g.: He won't. Really. -->
<rule break="yes">
@@ -4391,7 +4406,12 @@
<beforebreak>[ap]\.m\.\s</beforebreak>
<afterbreak>\p{Lu}</afterbreak>
</rule>
-<!-- Don't split at e.g. "U. S. A." -->
+<!-- Split at e.g. "1a. Und ..." -->
+<rule break="yes">
+<beforebreak>\d+[a-z]\.\s</beforebreak>
+<afterbreak>\p{Lu}</afterbreak>
+</rule>
+<!-- Don't split at e.g. "d. h." -->
<rule break="no">
<beforebreak>[^-\p{L}]\p{L}[\.!?…]['|"|«|\)|\]|\}]?\s</beforebreak>
<afterbreak></afterbreak>
@@ -4518,6 +4538,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Danish">
<!-- Split e.g.: He won't. Really. -->
<rule break="yes">
@@ -4596,7 +4617,7 @@
<afterbreak>januar|februar|marts|april|maj|juni|juli|august|september|oktober|november|december</afterbreak>
</rule>
<!-- einige deutsche Monate, vor denen eine Zahl erscheinen kann,
- ohne dass eine Satzgrenze erkannt wird
+ ohne dass eine Satzgrenze erkannt wird
(z.B. "am 13. Dezember" -> keine Satzgrenze) -->
<rule break="no">
<beforebreak>\d+\.\s</beforebreak>
@@ -4671,6 +4692,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Esperanto">
<!-- Esperanto abbreviations (see
http://eo.lernu.net/lernado/gramatiko/demandoj/mallongigoj.php) -->
<rule break="no">
@@ -4699,6 +4721,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Ukrainian">
<rule break="no">
<beforebreak>\b\d+\.\s</beforebreak>
@@ -4838,6 +4861,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Belarusian">
<rule break="no">
<beforebreak>\b\d+\.\s</beforebreak>
@@ -4925,6 +4949,7 @@
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Galician">
<!-- s. XIX; s.IX; sec. XX; séc. XX -->
<rule break="no">
@@ -5040,6 +5065,7 @@
<afterbreak>['"«¡¿\p{Ps}\p{Pi}]?\p{Lu}\p{Ll}*</afterbreak>
</rule>
</languagerule>
+
<languagerule languagerulename="Japanese">
<rule break="no">
<beforebreak>[:]+[\p{Pe}\p{Pf}\p{Po}"-[\u002C\u003A\u003B\u055D\u060C\u061B\u0703\u0704\u0705\u0706\u0707\u0708\u0709\u07F8\u1363\u1364\u1365\u1366\u1802\u1804\u1808\u204F\u205D\u3001\uA60D\uFE10\uFE11\uFE13\uFE14\uFE50\uFE51\uFE54\uFE55\uFF0C\uFF1A\uFF1B\uFF64]]*</beforebreak>
Modified:
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/de/GermanSRXSentenceTokenizerTest.java
===================================================================
--- trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/de/GermanSRXSentenceTokenizerTest.java	2012-12-09 18:10:53 UTC (rev 8523)
+++ trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/de/GermanSRXSentenceTokenizerTest.java	2012-12-09 18:21:57 UTC (rev 8524)
@@ -43,6 +43,7 @@
testSplit("Das Schreiben ist auf den 31.1. datiert.");
testSplit("Das Schreiben ist auf den 3.10.2000 datiert.");
testSplit("Natürliche Vererbungsprozesse prägten sich erst im 18. und frühen 19. Jahrhundert aus.");
+ testSplit("Das ist ja 1a. ", "Und das auch.");
testSplit("Friedrich I., auch bekannt als Friedrich der Große.");
testSplit("Friedrich II., auch bekannt als Friedrich der Große.");
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Languagetool-commits mailing list
Languagetool-commits@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-commits