Revision: 8624
http://languagetool.svn.sourceforge.net/languagetool/?rev=8624&view=rev
Author: jaumeortola
Date: 2012-12-22 09:08:50 +0000 (Sat, 22 Dec 2012)
Log Message:
-----------
[ca] Don't apply apostrophation rules to non-Catalan words and unknown proper
nouns. More proper nouns are needed in the tagger dictionary.
Don't split expressions like A-7 or N-340 (road names) at the hyphen.
Modified Paths:
--------------
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
Modified:
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
===================================================================
---
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2012-12-22 07:10:02 UTC (rev 8623)
+++
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2012-12-22 09:08:50 UTC (rev 8624)
@@ -88,7 +88,7 @@
final List<String> l = new ArrayList<String>();
final StringTokenizer st = new
StringTokenizer(text.replaceAll("([\\p{L}])['’]([\\p{L}])", "$1##CA_APOS##$2")
.replaceAll("([\\p{L}])-([\\p{L}])-([\\p{L}])",
"$1##CA_HYPHEN##$2##CA_HYPHEN##$3") //it's necessary for words like
"vint-i-quatre"
- .replaceAll("([\\p{Ll}])-([\\p{Ll}])",
"$1##CA_HYPHEN##$2"),
+ .replaceAll("([\\p{L}])-([\\p{Ll}\\d])",
"$1##CA_HYPHEN##$2"),
"\u0020\u00A0\u115f\u1160\u1680"
+
"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+
"\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
===================================================================
---
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
2012-12-22 07:10:02 UTC (rev 8623)
+++
trunk/JLanguageTool/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
2012-12-22 09:08:50 UTC (rev 8624)
@@ -90,13 +90,7 @@
<token postag="RG"
regexp="yes">.+ment|molt|poc|tant|tan|aleshores|alhora|almanco|almenys|aviat|bastant|ben|encara|força|gaire|gairebé|gens|inclús|ja|llavors|mai|massa|menys|mig|millor|més|no|només|pas|pitjor|potser|prompte|prou|quasi|quelcom|sempre|sobretot|sols|suara|també|tampoc|tanmateix|tostemps|tot|tothora|àdhuc</token>
</pattern>
<disambig action="add"><wd pos="RG_anteposat"></wd></disambig>
- </rule>
- <rule id="MOT_FORASTER" name="paraula no catalana">
- <pattern>
- <token postag="UNKNOWN"
regexp="yes">.*ñ.*|.*á.*|.+ón|.+ía|.+íes|.+ch.+</token>
- </pattern>
- <disambig action="add"><wd pos="mot_foraster"></wd></disambig>
- </rule>
+ </rule>
<rulegroup id="CONTRACCIONS" name="contraccions">
<rule>
<pattern>
@@ -572,7 +566,7 @@
</pattern>
<disambig action="add"><wd pos="NPCS000"></wd></disambig>
</rule>
- <rule>
+ <!-- <rule>
<pattern>
<token postag="D[^R].*" postag_regexp="yes" negate_pos="yes"/>
<marker>
@@ -580,7 +574,7 @@
</marker>
</pattern>
<disambig action="add"><wd pos="NPCN000"></wd></disambig>
- </rule>
+ </rule> -->
<rule id="nom_persona_o_geogràfic" name="nom de persona o geogràfic: a
l'Àfrica?">
<pattern>
<token skip="1">a</token>
@@ -740,14 +734,13 @@
</rule>
<rule id="noNom2" name="FA LA no nom">
<pattern>
+ <token><exception postag="D[DAI]0MS0" postag_regexp="yes"
regexp="yes">el|un|aquest|algun</exception></token>
<marker>
<token regexp="yes">fa|la</token>
</marker>
<token><exception
regexp="yes">menor|major|sostingut|bemoll|diesi|becaire</exception></token>
</pattern>
- <disambig>
- <match no="1" postag_regexp="yes" postag="[^N].*"></match>
- </disambig>
+ <disambig action="filter" postag="[^N].*"/>
</rule>
<rule id="nAdj" name="no adjectiu">
<!-- paraules comunes que no solen ser adjectiu gairebé mai -->
@@ -7467,6 +7460,20 @@
<disambig action="add"><wd pos="hac_aspirada"></wd></disambig>
</rule>
</rulegroup>
+ <rulegroup id="MOT_FORASTER" name="paraula no catalana">
+ <rule>
+ <pattern>
+ <token postag="UNKNOWN"
regexp="yes">.*ñ.*|.*á.*|.+ón|.+ía|.+íes|.+ch.+</token>
+ </pattern>
+ <disambig action="add"><wd pos="mot_foraster"></wd></disambig>
+ </rule>
+ <rule>
+ <pattern>
+ <token>agua</token>
+ </pattern>
+ <disambig action="add"><wd pos="mot_foraster"></wd></disambig>
+ </rule>
+ </rulegroup>
<rulegroup id="verb_reflexiu" name="verb acomopanyat de pronom reflexiu">
<rule>
<pattern>
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
===================================================================
---
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
2012-12-22 07:10:02 UTC (rev 8623)
+++
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
2012-12-22 09:08:50 UTC (rev 8624)
@@ -5702,7 +5702,7 @@
<rule>
<pattern>
<token>de</token>
- <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].*|el|els|a|e|i|o|u|efa|ef|efe|hac|ela|ele|ema|eme|em|ena|ene|en|erra|erre|er|essa|esse|es|ics|iure</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].*|hosts?|el|els|a|e|i|o|u|efa|ef|efe|hac|ela|ele|ema|eme|em|ena|ene|erra|erre|essa|esse|ics|iure</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|V.[MSI].*|NP..000"
postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>d'<match
no="2"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5712,12 +5712,12 @@
<example type="correct">de Hubble</example>
<example type="correct">de atletismo</example>
<example type="correct">de Iugoslàvia</example>
-
+ <example type="correct">el 23 de agosto de 1984</example>
</rule>
<rule>
<pattern>
<token>el</token>
- <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i|host</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>l'<match
no="2"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5732,7 +5732,7 @@
<pattern>
<marker>
<token postag="DA0MS0">en</token>
- <token postag="NPMSSP0"
regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token postag="NPMSSP0"
regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</marker>
<token postag="V...[12].." postag_regexp="yes"
negate_pos="yes" />
</pattern>
@@ -5747,7 +5747,7 @@
<pattern>
<token>de</token>
<token spacebefore="no">l</token>
- <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i|host</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>de l'<match
no="3"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5761,7 +5761,7 @@
<pattern>
<token>a</token>
<token spacebefore="no">l</token>
- <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i|host</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>a l'<match
no="3"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5775,7 +5775,7 @@
<pattern>
<token>pe</token>
<token spacebefore="no">l</token>
- <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóòu].+|o|i|host</exception><exception
inflected="yes">ordinal</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>per l'<match
no="3"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5845,47 +5845,49 @@
<example type="correct">d'ionització</example>
<example type="correct">ve d'Iowa.</example>
</rule>
+ <rule>
+ <pattern>
+ <token>l'</token>
+ <token postag="N.M.*|A..M.*" postag_regexp="yes"
regexp="yes">h?[ui][aeoiàèéóòu].*<exception
regexp="yes">ió|uix.+</exception></token>
+ </pattern>
+ <message>Cal escriure: <suggestion>el <match
no="2"></match></suggestion>.</message>
+ <short>Error ortogràfic</short>
+ <example type="incorrect" correction="el
iogurt"><marker>l'iogurt</marker></example>
+ <example type="incorrect" correction="el
iugoslau"><marker>l'iugoslau</marker></example>
+ <example type="correct">l'uixer</example>
+ <example type="correct">l'Uixó</example>
+ </rule>
</rulegroup>
<rulegroup id="L_D_NOM_LLETRA" name="apostrofació: no davant de nom de
lletra">
<rule>
<pattern>
<token>d'</token>
- <token
regexp="yes">a|e|i|o|efa|ef|efe|hac|ela|el|ele|ema|eme|em|ena|ene|en|erra|erre|er|essa|esse|es|ics</token>
+ <token
regexp="yes">a|e|i|o|efa|ef|efe|hac|ela|ele|ema|eme|em|ena|ene|erra|erre|er|essa|esse|ics<exception
regexp="yes">(?-i)I</exception></token>
</pattern>
<message>Els noms de lletra no s'apostrofen. Cal escriure:
<suggestion>de <match no="2"></match></suggestion>.</message>
<short>Error ortogràfic</short>
<example type="incorrect">Darrere
<marker>d'essa</marker></example>
<example type="correct">Darrere de essa</example>
<example type="correct">d'u a cinc</example>
+ <example type="correct">La casa d'en Joan</example>
</rule>
<rule>
<pattern>
<token>l'</token>
- <token
regexp="yes">a|e|i|o|efa|ef|efe|hac|ela|el|ele|ema|eme|em|ena|ene|en|erra|erre|er|essa|esse|es|ics</token>
+ <token
regexp="yes">a|e|i|o|efa|ef|efe|hac|ela|ele|ema|eme|em|ena|ene|er|essa|esse|ics<exception
regexp="yes">(?-i)I</exception></token>
</pattern>
<message>Els noms de lletra no s'apostrofen. Cal escriure:
<suggestion>la <match no="2"></match></suggestion>.</message>
<short>Error ortogràfic</short>
<example type="incorrect">Darrere de
<marker>l'essa</marker></example>
<example type="correct">Darrere la essa</example>
<example type="correct">de l'u al cinc</example>
+ <example type="correct">Enllaça amb l'A-7</example>
</rule>
</rulegroup>
- <rulegroup id="LA_NA_NOM_FEMENI" name="apostrofació: la/na + femení">
+ <rulegroup id="LA_NA_NOM_FEMENI" name="apostrofació: la/na + femení">
<rule>
<pattern>
<token>l'</token>
- <token postag="N.M.*|A..M.*" postag_regexp="yes"
regexp="yes">h?[ui][aeoiàèéóòu].*<exception
regexp="yes">ió|uix.+</exception></token>
- </pattern>
- <message>Cal escriure: <suggestion>el <match
no="2"></match></suggestion>.</message>
- <short>Error ortogràfic</short>
- <example type="incorrect" correction="el
iogurt"><marker>l'iogurt</marker></example>
- <example type="incorrect" correction="el
iugoslau"><marker>l'iugoslau</marker></example>
- <example type="correct">l'uixer</example>
- <example type="correct">l'Uixó</example>
- </rule>
- <rule>
- <pattern>
- <token>l'</token>
<token postag="N.F.*|A..F.*|_GN_F." postag_regexp="yes"
regexp="yes">h?[ui].+|host<exception
regexp="yes">hidra|hifa|hulla|ulna|umbra|una|ungla|unça|upa|urbs|urna|urpa|idus|illa|inca|iva</exception><exception
postag="_GN_M.*" postag_regexp="yes" /></token>
</pattern>
<message>Cal escriure: <suggestion>la <match
no="2"></match></suggestion>.</message>
@@ -5928,7 +5930,7 @@
<rule>
<pattern>
<token>la</token>
- <token
regexp="yes">h?[aeoàèéíòóú].*|h?[ui][^aeiouàèéíòóúüï]+[aeiou][ns]?|urbs<exception
regexp="yes">host|ira|inxa|[aeiou]|efa|hac|ela|ema|ena|erra|essa|ics|una</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/><exception
regexp="yes">(?-i)Haia</exception></token>
+ <token
regexp="yes">h?[aeoàèéíòóú].*|h?[ui][^aeiouàèéíòóúüï]+[aeiou][ns]?|urbs<exception
regexp="yes">host|ira|inxa|[aeiou]|efa|hac|ela|ema|en|ena|ene|er|erra|erre|essa|ics|una</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000"
postag_regexp="yes"/><exception regexp="yes">(?-i)Haia</exception></token>
</pattern>
<message>Cal escriure: <suggestion>l'<match
no="2"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5955,7 +5957,7 @@
<rule>
<pattern>
<token>na</token>
- <token postag="N.FS.*" postag_regexp="yes"
regexp="yes">h?[aeoàèéíòóú].*|h?[ui][^aeiouàèéíòóúüï]+[aeiou][ns]?|urbs<exception
regexp="yes">host|ira|inxa</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token postag="N.FS.*" postag_regexp="yes"
regexp="yes">h?[aeoàèéíòóú].*|h?[ui][^aeiouàèéíòóúüï]+[aeiou][ns]?|urbs<exception
regexp="yes">host|ira|inxa</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>n'<match
no="2"></match></suggestion>.</message>
<short>Error ortogràfic</short>
@@ -5968,7 +5970,7 @@
<rule>
<pattern>
<token>cal</token>
- <token postag="NPMSSP0"
regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóò].+</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token postag="NPMSSP0"
regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóò].+</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>ca l'<match
no="2"></match></suggestion>.</message>
<short>Error d'apostrofació</short>
@@ -5978,7 +5980,7 @@
<rule>
<pattern>
<token postag="contraccio">cal</token>
- <token postag="N.[MC][SN].*|AQ0[MC][SN]0"
postag_regexp="yes" regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóò].+</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN" postag_regexp="yes"/></token>
+ <token postag="N.[MC][SN].*|AQ0[MC][SN]0"
postag_regexp="yes" regexp="yes">h?[aeiouàèéíòóú].*<exception
regexp="yes">h?[ui][aeioàèéóò].+</exception><exception
postag="hac_aspirada|mot_foraster|UNKNOWN|NP..000" postag_regexp="yes"/></token>
</pattern>
<message>Cal escriure: <suggestion>ca l'<match
no="2"></match></suggestion>.</message>
<short>Error d'apostrofació</short>
@@ -12706,7 +12708,7 @@
<url>http://esadir.cat/lexic/entrades/assentarse</url>
<example type="incorrect" correction="asseure">Es van
<marker>assentar</marker> a taula.</example>
<example type="incorrect" correction="assegué">Joan
s'<marker>assentà</marker> a la cadira.</example>
- <example type="correct">Els israelians s'assentaren a
Cisjordània.</example>
+ <!-- <example type="correct">Els israelians s'assentaren a
Cisjordània.</example> -->
<example type="correct">S'assentaren en aquell territori.</example>
<example type="correct">Volien que assentéssim les bases del
futur.</example>
<example type="correct">S'assentaren a Argentina.</example>
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits