Revision: 10136
http://sourceforge.net/p/languagetool/code/10136
Author: jaumeortola
Date: 2013-05-23 18:27:31 +0000 (Thu, 23 May 2013)
Log Message:
-----------
[ca] Improve sentence tokenization
Modified Paths:
--------------
trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
Modified:
trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx
===================================================================
--- trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx 2013-05-22 20:35:04 UTC (rev 10135)
+++ trunk/languagetool/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx 2013-05-23 18:27:31 UTC (rev 10136)
@@ -4264,7 +4264,7 @@
</rule>
<!-- Abbreviations that cannot finish sentences-->
<rule break="no"> <!-- removed from abbreviations: u, s -->
-<beforebreak>\b(?iu)(C|Dr|Dra|E|Emm|Emma|Excm|Excma|H|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|M|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|alt|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|art|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cant|cap|cartogr|cast|cat|catedr|catol|cert|cf|cia|cin|cint|circul|cit|climat|col|col·l|com|compt|cons|constr|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dc|dept|derog|des|desp|dg|dip|dir|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|etc|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|fra|g|gen|gov|gr|gral|h|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|m|merc|mil·l|màx|mín|n|neg|nom|nov|nre|núm|o|oct|op|p|paq|par|parc|part|pda|pg|pl|pobl|pol|ppda|ppt|pral|pres|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|set|sgt|sotsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|veg|venc|vid|vig|vocab|vol|vs|x|àt|íd)\.\s</beforebreak>
+<beforebreak>\b(?iu)(C|Dr|Dra|E|Emm|Emma|Excm|Excma|H|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|M|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|alt|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|art|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cant|cap|cartogr|cast|cat|catedr|catol|cert|cf|cia|cin|cint|circul|cit|climat|col|col·l|com|compt|cons|constr|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dc|dept|derog|des|desp|dg|dip|dir|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|etc|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|fra|g|gen|gov|gr|gral|h|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|m|merc|mil·l|màx|mín|n|neg|nom|nov|nre|núm|o|oct|op|p|paq|par|parc|part|pda|pg|pl|pobl|pol|ppda|ppt|pral|pres|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|set|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vol|vs|x|àt|íd)\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
<!-- Abbreviations that can finish sentences -->
@@ -4308,7 +4308,7 @@
</rule>
<!-- (enum...) -->
<rule break="no">
-<beforebreak>\b(\Q...\E|…)[\p{Pe}»]\s</beforebreak>
+<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
<!-- pero ¡ah! no estaba
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java 2013-05-22 20:35:04 UTC (rev 10135)
+++ trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java 2013-05-23 18:27:31 UTC (rev 10136)
@@ -51,6 +51,7 @@
testSplit(new String[] { "Núm. operació 220130000138." });
// Ellipsis
+ testSplit(new String[] { "el vi no és gens propi de monjos, amb tot...\" vetllant, això sí" });
testSplit(new String[] { "Desenganyeu-vos… ",
"L’únic problema seriós de l'home en aquest món és el de subsistir."
});
testSplit(new String[] { "és clar… traduir és una feina endimoniada" });
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits