Revision: 8953
http://languagetool.svn.sourceforge.net/languagetool/?rev=8953&view=rev
Author: jaumeortola
Date: 2013-01-12 13:52:43 +0000 (Sat, 12 Jan 2013)
Log Message:
-----------
[ca] Improved sentence tokenization. Changed suggestions in PHRASE_REPETITION
rule.
Modified Paths:
--------------
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
===================================================================
---
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
2013-01-12 13:05:51 UTC (rev 8952)
+++
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
2013-01-12 13:52:43 UTC (rev 8953)
@@ -33,8 +33,7 @@
<token postag="(P.|V.{3})[30].*" postag_regexp="yes"></token>
</equivalence>
</unification>
- <rulegroup id="PUNTUACIO" name="signes de puntuació">
-
+ <rulegroup id="PUNTUACIO" name="signes de puntuació">
<rule>
<pattern>
<token regexp="yes">[.;:!?—–…()\[\]\-]</token>
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
===================================================================
---
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
2013-01-12 13:05:51 UTC (rev 8952)
+++
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/segment.srx
2013-01-12 13:52:43 UTC (rev 8953)
@@ -4249,9 +4249,18 @@
<languagerule languagerulename="Catalan">
<!-- initials: A. C. Jones -->
<rule break="no">
-<beforebreak>\b[A-Z]\.\s</beforebreak>
+<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
<afterbreak></afterbreak>
</rule>
+<!-- Solve problem with apostrophe before last word of sentence (why does this
happen?)-->
+<rule break="yes">
+<beforebreak>\b[ldmtsnLDMTSN]'[\p{L}]{2,}\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
+<rule break="yes">
+<beforebreak>\b[ldmtsnLDMTSN]'[\p{Ll}]\.\s</beforebreak>
+<afterbreak></afterbreak>
+</rule>
<!-- Abbreviations that cannot finish sentences-->
<rule break="no">
<beforebreak>\b(C|Dr|Dra|E|Emm|Emma|Excm|Excma|H|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|M|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|alt|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|art|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cant|cap|cartogr|cast|cat|catedr|catol|cert|cf|cia|cin|cint|circul|cit|climat|col|col·l|com|compt|cons|constr|contr|conv|cor|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dc|dept|derog|des|desp|dg|dip|dir|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|etc|ex|exc|exp|exped|ext|f|fac|fca|febr|fra|g|gen|gov|gr|gral|h|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|m|merc|mil·l|màx|mín|n|neg|nom|nov|nre|núm|o|oct|op|p|paq|par|parc|part|pda|pg|pl|pobl|pol|ppda|ppt|pral|pres|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|s|secr|serv|set|sgt|sotsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|u|univ|urb|v|veg|venc|vid|vig|vocab|vol|x|àt|íd)\.\s</beforebreak>
@@ -4287,6 +4296,11 @@
<beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
<afterbreak>\p{Ll}</afterbreak>
</rule>
+<!-- Ellipsis: ... lowercase -->
+<rule break="no">
+<beforebreak>\b(\Q...\E|…)\s</beforebreak>
+<afterbreak>\p{Ll}</afterbreak>
+</rule>
<!-- (enum...) -->
<rule break="no">
<beforebreak>\b(\Q...\E|…)\p{Pe}\s</beforebreak>
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
===================================================================
---
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
2013-01-12 13:05:51 UTC (rev 8952)
+++
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
2013-01-12 13:52:43 UTC (rev 8953)
@@ -5552,22 +5552,38 @@
<!-- made more general-->
<rule>
<pattern>
- <token negate_pos="yes" postag="SENT_START"
regexp="yes"><exception>i</exception><exception
postag="allow_repetition">de</exception>[\p{L}·'-]+</token>
- <token
regexp="yes"><exception>i</exception>[\p{L}·'-]+</token>
<marker>
+ <token negate_pos="yes" postag="SENT_START"
regexp="yes"><exception>i</exception><exception
postag="allow_repetition">de</exception>[\p{L}·'-]+</token>
+ <token regexp="yes"
spacebefore="yes"><exception>i</exception>[\p{L}·'-]+</token>
<token><match no="0"/><exception
postag="&lt;/LOC_ADV&gt;" regexp="yes">poc|més|pas|bat</exception></token>
<token><match no="1"/><exception
postag="&lt;/LOC_ADV&gt;" regexp="yes">poc|més</exception></token>
</marker>
</pattern>
- <message>Elimina la duplicació.
<suggestion></suggestion></message>
+ <message>Elimina la duplicació. <suggestion><match no="1"/>
<match no="2"/></suggestion></message>
<short>Frase duplicada</short>
- <example correction="" type="incorrect">Benvinguts a casa
<marker>a casa</marker> meva.</example>
- <example correction="" type="incorrect">És l'americà
<marker>l'americà</marker>.</example>
+ <example correction="a casa" type="incorrect">Benvinguts
<marker>a casa a casa</marker> meva.</example>
<example type="correct">No volia res més que mirar i mirar i
mirar.</example>
<example type="correct">A més a més, ho va fer a poc a
poc.</example>
<example type="correct">A diferència dels dels
ocells.</example>
<example type="correct">Puja pas a pas a les ruïnes</example>
- </rule>
+ </rule>
+ <rule>
+ <pattern>
+ <marker>
+ <token negate_pos="yes" postag="SENT_START"
regexp="yes"><exception>i</exception><exception
postag="allow_repetition">de</exception>[\p{L}·'-]+</token>
+ <token regexp="yes"
spacebefore="no"><exception>i</exception>[\p{L}·'-]+</token>
+ <token><match no="0"/><exception
postag="&lt;/LOC_ADV&gt;" regexp="yes">poc|més|pas|bat</exception></token>
+ <token><match no="1"/><exception
postag="&lt;/LOC_ADV&gt;" regexp="yes">poc|més</exception></token>
+ </marker>
+ </pattern>
+ <message>Elimina la duplicació. <suggestion><match
no="1"/><match no="2"/></suggestion></message>
+ <short>Frase duplicada</short>
+ <example correction="l'americà" type="incorrect">És
<marker>l'americà l'americà</marker>.</example>
+ <example type="correct">No volia res més que mirar i mirar i
mirar.</example>
+ <example type="correct">A més a més, ho va fer a poc a
poc.</example>
+ <example type="correct">A diferència dels dels
ocells.</example>
+ <example type="correct">Puja pas a pas a les ruïnes</example>
+ </rule>
</rulegroup>
<rulegroup id="DOS_ARTICLES" name="Dos articles seguits: *la
l'aparició">
<rule>
Modified:
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
===================================================================
---
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
2013-01-12 13:05:51 UTC (rev 8952)
+++
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanSentenceTokenizerTest.java
2013-01-12 13:52:43 UTC (rev 8953)
@@ -35,10 +35,12 @@
// Initials
testSplit(new String[] { "A l'atenció d'A. Comes." });
+ testSplit(new String[] { "A l'atenció d'À. Comes." });
// Ellipsis
testSplit(new String[] { "Desenganyeu-vos… ",
"L’únic problema seriós de l'home en aquest món
és el de subsistir." });
+ testSplit(new String[] { "és clar… traduir és una feina
endimoniada" });
// Abbreviations
testSplit(new String[] { "Viu al núm. 24 del carrer de l'Hort."
});
@@ -46,6 +48,10 @@
testSplit(new String[] { "Distingit Sr. Joan," });
testSplit(new String[] { "Molt Hble. Sr. President" });
+ // A problem solved
+ testSplit(new String[] { "El pou d'Avall. ", "És bonic." });
+ testSplit(new String[] { "El pou d’Avall. ", "És bonic." });
+ testSplit(new String[] { "Ell viu a l'u. ", "Jo al dos." });
}
private void testSplit(final String[] sentences) {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Master Visual Studio, SharePoint, SQL, ASP.NET, C# 2012, HTML5, CSS,
MVC, Windows 8 Apps, JavaScript and much more. Keep your skills current
with LearnDevNow - 3,200 step-by-step video tutorials by Microsoft
MVPs and experts. SALE $99.99 this month only -- learn more at:
http://p.sf.net/sfu/learnmore_122912
_______________________________________________
Languagetool-commits mailing list
Languagetool-commits@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-commits