Revision: 9041
http://languagetool.svn.sourceforge.net/languagetool/?rev=9041&view=rev
Author: jaumeortola
Date: 2013-01-16 19:14:42 +0000 (Wed, 16 Jan 2013)
Log Message:
-----------
[ca] Sentence tokenization improved.
Modified Paths:
--------------
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
===================================================================
--- trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx	2013-01-16 18:44:07 UTC (rev 9040)
+++ trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx	2013-01-16 19:14:42 UTC (rev 9041)
@@ -4252,18 +4252,9 @@
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
-<!-- Solve problem with apostrophe before last word of sentence (why this happens?)-->
-<rule break="yes">
-<beforebreak>\b[ldmtsnLDMTSN]'[\p{L}]{2,}\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
-<rule break="yes">
-<beforebreak>\b[ldmtsnLDMTSN]'[\p{Ll}]\.\s</beforebreak>
-<afterbreak></afterbreak>
-</rule>
<!-- Abbreviations that cannot finish sentences-->
-<rule break="no">
-<beforebreak>\b(C|Dr|Dra|E|Emm|Emma|Excm|Excma|H|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|M|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|alt|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|art|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cant|cap|cartogr|cast|cat|catedr|catol|cert|cf|cia|cin|cint|circul|cit|climat|col|col·l|com|compt|cons|constr|contr|conv|cor|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dc|dept|derog|des|desp|dg|dip|dir|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|etc|ex|exc|exp|exped|ext|f|fac|fca|febr|fra|g|gen|gov|gr|gral|h|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|m|merc|mil·l|màx|mín|n|neg|nom|nov|nre|núm|o|oct|op|p|paq|par|parc|part|pda|pg|pl|pobl|pol|ppda|ppt|pral|pres|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|s|secr|serv|set|sgt|sotsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|u|univ|urb|v|veg|venc|vid|vig|vocab|vol|x|àt|íd)\.\s</beforebreak>
+<rule break="no"> <!-- removed from abbreviations: u -->
+<beforebreak>\b(C|Dr|Dra|E|Emm|Emma|Excm|Excma|H|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|M|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|alt|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|art|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cant|cap|cartogr|cast|cat|catedr|catol|cert|cf|cia|cin|cint|circul|cit|climat|col|col·l|com|compt|cons|constr|contr|conv|cor|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dc|dept|derog|des|desp|dg|dip|dir|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|etc|ex|exc|exp|exped|ext|f|fac|fca|febr|fra|g|gen|gov|gr|gral|h|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|m|merc|mil·l|màx|mín|n|neg|nom|nov|nre|núm|o|oct|op|p|paq|par|parc|part|pda|pg|pl|pobl|pol|ppda|ppt|pral|pres|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|s|secr|serv|set|sgt|sotsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|veg|venc|vid|vig|vocab|vol|x|àt|íd)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- Abbreviations that can finish sentences -->
@@ -4317,9 +4308,14 @@
<afterbreak></afterbreak>
</rule>
<rule break="yes">
-<beforebreak>\s\p{L}+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s*</beforebreak>
-<afterbreak>[¡¿«»"'\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
+<beforebreak>\s[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s*</beforebreak>
+<afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
</rule>
+<!-- paragraphs with opening "»" in dialogs-->
+<rule break="yes">
+<beforebreak>[\.:!?…»]+\s</beforebreak>
+<afterbreak>»[^\s\.:!?…]</afterbreak>
+</rule>
</languagerule>
<languagerule languagerulename="Spanish">
<!-- Abbreviations that cannot finish sentences-->
Modified:
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
===================================================================
--- trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java	2013-01-16 18:44:07 UTC (rev 9040)
+++ trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java	2013-01-16 19:14:42 UTC (rev 9041)
@@ -33,6 +33,19 @@
public final void testTokenize() {
+ // Simple sentences
+ testSplit(new String[] { "Això és una frase. ", "Això és una altra frase." });
+ testSplit(new String[] { "Aquesta és l'egua. ", "Aquell és el cavall." });
+ testSplit(new String[] { "Aquesta és l'egua? ", "Aquell és el cavall." });
+ testSplit(new String[] { "Vols col·laborar? ", "Sí, i tant." });
+ testSplit(new String[] { "Com vas d'il·lusió? ", "Bé, bé." });
+ testSplit(new String[] { "Com vas d’il·lusió? ", "Bé, bé." });
+ testSplit(new String[] { "És d’abans-d’ahir? ", "Bé, bé." });
+ testSplit(new String[] { "És d’abans-d’ahir! ", "Bé, bé." });
+ testSplit(new String[] { "Què vols dir? ", "Ja ho tinc!" });
+ testSplit(new String[] { "Ja ho tinc! ", "Què vols dir?" });
+ testSplit(new String[] { "Us explicaré com va anar: ", "»La Maria va engegar el cotxe" });
+
// Initials
testSplit(new String[] { "A l'atenció d'A. Comes." });
testSplit(new String[] { "A l'atenció d'À. Comes." });
@@ -48,16 +61,12 @@
testSplit(new String[] { "Viu al núm. 24 del carrer de l'Hort." });
testSplit(new String[] { "El Dr. Joan no vindrà." });
testSplit(new String[] { "Distingit Sr. Joan," });
- testSplit(new String[] { "Molt Hble. Sr. President" });
-
- // A problem solved
- testSplit(new String[] { "El pou d'Avall. ", "És bonic." });
- testSplit(new String[] { "El pou d’Avall. ", "És bonic." });
- testSplit(new String[] { "Ell viu a l'u. ", "Jo al dos." });
+ testSplit(new String[] { "Molt Hble. Sr. President" });
- //Unsolved problem
- //testSplit(new String[] { "–La vols més fina, l'euga? ", "Mira-te-la, fill meu, l'euga." });
+ // Exception to abbreviations
+ testSplit(new String[] { "Ell és el número u. ", "Jo el dos." });
+
}
private void testSplit(final String[] sentences) {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Master Java SE, Java EE, Eclipse, Spring, Hibernate, JavaScript, jQuery
and much more. Keep your Java skills current with LearnJavaNow -
200+ hours of step-by-step video tutorials by Java experts.
SALE $49.99 this month only -- learn more at:
http://p.sf.net/sfu/learnmore_122612
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits