Revision: 9692
http://languagetool.svn.sourceforge.net/languagetool/?rev=9692&view=rev
Author: jaumeortola
Date: 2013-03-15 11:26:25 +0000 (Fri, 15 Mar 2013)
Log Message:
-----------
[ca] Avoid tagging lowercase words with tags for uppercase words.
Fixes in tagger dictionary.
New multiwords.
Modified Paths:
--------------
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan.dict
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_synth.dict
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
Modified:
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
===================================================================
--- trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java	2013-03-15 11:26:25 UTC (rev 9692)
@@ -43,6 +43,7 @@
private IStemmer dictLookup;
private Locale conversionLocale = Locale.getDefault();
+ boolean tagLowercaseWithUppercase = true;
/**
* Get the filename, e.g., <tt>/resource/fr/french.dict</tt>.
@@ -84,7 +85,7 @@
//uppercase
if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
- if (isLowercase) {
+ if (tagLowercaseWithUppercase && isLowercase) {
upperTaggerTokens = asAnalyzedTokenList(word,
dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
if (!upperTaggerTokens.isEmpty()) {
@@ -150,5 +151,9 @@
public AnalyzedToken createToken(String token, String posTag) {
return new AnalyzedToken(token, posTag, null);
}
+
+ public void dontTagLowercaseWithUppercase() {
+ tagLowercaseWithUppercase=false;
+ }
}
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java	2013-03-15 11:26:25 UTC (rev 9692)
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.net.URL;
+import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@@ -28,8 +29,11 @@
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;
+import org.languagetool.AnalyzedToken;
+import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tagging.BaseTagger;
+import org.languagetool.tools.StringTools;
/**
* Catalan Tagger
@@ -52,6 +56,7 @@
public CatalanTagger() {
super();
setLocale(new Locale("ca"));
+ this.dontTagLowercaseWithUppercase();
}
public boolean existsWord(String word) throws IOException {
@@ -71,4 +76,8 @@
}
return true;
}
+
+
+
+
}
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan.dict
===================================================================
(Binary files differ)
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_synth.dict
===================================================================
(Binary files differ)
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt	2013-03-15 11:26:25 UTC (rev 9692)
@@ -97,6 +97,7 @@
NPFSO00
NPFSSP0
NPMNG00
+NPMNSP0
NPMPG00
NPMPO00
NPMPSP0
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml	2013-03-15 11:26:25 UTC (rev 9692)
@@ -737,28 +737,6 @@
</pattern>
<disambig action="unify"/>
</rule>
- </rulegroup>
- <rulegroup id="lo_admissible" name="Tirant lo Blanc">
- <rule>
- <pattern case_sensitive="yes">
- <token>Tirant</token>
- <marker>
- <token>lo</token>
- </marker>
- <token>Blanc</token>
- </pattern>
- <disambig action="immunize"/>
- </rule>
- <rule>
- <pattern case_sensitive="yes">
- <marker>
- <token>Lo</token>
- </marker>
- <token skip="1">Rat</token>
- <token>Penat</token>
- </pattern>
- <disambig action="immunize"/>
- </rule>
</rulegroup>
<rulegroup id="interjeccions" name="interjeccions">
<rule>
@@ -8395,4 +8373,26 @@
</pattern>
<disambig action="add"><wd pos="_possible_nompropi"/></disambig>
</rule>
-</rules>
\ No newline at end of file
+ <rulegroup id="lo_admissible" name="Tirant lo Blanc">
+ <rule>
+ <pattern case_sensitive="yes">
+ <token>Tirant</token>
+ <marker>
+ <token>lo</token>
+ </marker>
+ <token>Blanc</token>
+ </pattern>
+ <disambig action="immunize"/>
+ </rule>
+ <rule>
+ <pattern case_sensitive="yes">
+ <marker>
+ <token>Lo</token>
+ </marker>
+ <token skip="1">Rat</token>
+ <token>Penat</token>
+ </pattern>
+ <disambig action="immunize"/>
+ </rule>
+ </rulegroup>
+</rules>
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt	2013-03-15 11:26:25 UTC (rev 9692)
@@ -55,6 +55,7 @@
A diferència de LOC_PREP
A dojo LOC_ADV
A dretcient LOC_ADV
+A esclatacor LOC_ADV
A excepció d' LOC_PREP
A excepció de LOC_PREP
A falta d' LOC_PREP
@@ -63,6 +64,10 @@
A favor d' LOC_PREP
A força d' LOC_PREP
A força de LOC_PREP
+A forfollons LOC_ADV
+A gatamèus LOC_ADV
+A granel LOC_ADV
+A gratcient LOC_ADV
A guisa d' LOC_PREP
A guisa de LOC_PREP
A l caient d' LOC_PREP
@@ -131,6 +136,8 @@
A la doba LOC_ADJ
A la llarga LOC_ADV
A la claror de LOC_PREP
+A la funerala LOC_ADV
+A la gatzoneta LOC_ADV
A la mercè d' LOC_PREP
A la mercè de LOC_PREP
A la recerca d' LOC_PREP
@@ -140,6 +147,8 @@
A la vora de LOC_PREP
A les envistes d' LOC_PREP
A les envistes de LOC_PREP
+A les hurtes LOC_ADV
+A lloure LOC_ADV
A manera d' LOC_PREP
A manera de LOC_PREP
A mercè d' LOC_PREP
@@ -181,6 +190,7 @@
A última hora LOC_ADV
Abans d' LOC_PREP
Abans de LOC_PREP
+Ad hominem LOC_ADJ
Així mateix LOC_CONJ
Això no obstant LOC_ADV
Al capdavall LOC_ADV
@@ -241,6 +251,7 @@
D' esquerres LOC_ADV
D' esquitllada LOC_ADV
D' esquitllentes LOC_ADV
+D' esquitllèbit LOC_ADV
D' estar per casa LOC_ADV
D' estira i arronsa LOC_ADV
D' estranquis LOC_ADV
@@ -279,6 +290,11 @@
De debò LOC_ADV
De dretes LOC_ADV
De facto LOC_ADV
+De folondres LOC_ADV
+De futris LOC_ADV
+De gaidó LOC_ADV
+De gairell LOC_ADV
+De genollons LOC_ADV
De gom a gom LOC_ADV
De iure LOC_ADV
De l tot LOC_ADV
@@ -433,13 +449,20 @@
Fins ara LOC_ADV
Fins avui LOC_ADV
Fins i tot LOC_ADV
+Foie gras NCMS000
Front a front LOC_ADV
Futbol sala NCMS000
Gens ni mica LOC_ADV
Gran Bretanya NPFSG00
Gràcies a LOC_PREP
+Grosso modo LOC_ADV
Honoris causa LOC_ADV
In extremis LOC_ADV
+In extenso LOC_ADV
+In fraganti LOC_ADV
+In vitro LOC_ADV
+In situ LOC_ADV
+Ipso facto LOC_ADV
Junt amb LOC_PREP
Juntament amb LOC_PREP
Llevat d' LOC_PREP
@@ -599,6 +622,7 @@
a diferència de LOC_PREP
a dojo LOC_ADV
a dretcient LOC_ADV
+a esclatacor LOC_ADV
a excepció d' LOC_PREP
a excepció de LOC_PREP
a falta d' LOC_PREP
@@ -607,6 +631,10 @@
a favor d' LOC_PREP
a força d' LOC_PREP
a força de LOC_PREP
+a forfollons LOC_ADV
+a gatamèus LOC_ADV
+a granel LOC_ADV
+a gratcient LOC_ADV
a guisa d' LOC_PREP
a guisa de LOC_PREP
a l caient d' LOC_PREP
@@ -673,6 +701,8 @@
a la biorxa LOC_ADV
a la claror d' LOC_PREP
a la claror de LOC_PREP
+a la funerala LOC_ADV
+a la gatzoneta LOC_ADV
a la doba LOC_ADJ
a la llarga LOC_ADV
a la mercè d' LOC_PREP
@@ -684,6 +714,8 @@
a la vora de LOC_PREP
a les envistes d' LOC_PREP
a les envistes de LOC_PREP
+a les hurtes LOC_ADV
+a lloure LOC_ADV
a manera d' LOC_PREP
a manera de LOC_PREP
a mercè d' LOC_PREP
@@ -725,6 +757,7 @@
a última hora LOC_ADV
abans d' LOC_PREP
abans de LOC_PREP
+ad hominem LOC_ADJ
així mateix LOC_CONJ
això no obstant LOC_ADV
al capdavall LOC_ADV
@@ -784,6 +817,7 @@
d' esquerres LOC_ADV
d' esquitllada LOC_ADV
d' esquitllentes LOC_ADV
+d' esquitllèbit LOC_ADV
d' estar per casa LOC_ADV
d' estira i arronsa LOC_ADV
d' estranquis LOC_ADV
@@ -823,6 +857,11 @@
de dretes LOC_ADV
de facto LOC_ADV
de fiar LOC_ADJ
+de folondres LOC_ADV
+de futris LOC_ADV
+de gaidó LOC_ADV
+de gairell LOC_ADV
+de genollons LOC_ADV
de gom a gom LOC_ADV
de iure LOC_ADV
de l tot LOC_ADV
@@ -979,14 +1018,21 @@
fins ara LOC_ADV
fins avui LOC_ADV
fins i tot LOC_ADV
+foie gras NCMS000
front a front LOC_ADV
futbol sala NCMS000
gens ni mica LOC_ADV
gràcies a LOC_PREP
+grosso modo LOC_ADV
hagudes i per haver AQ0FP0
haguts i per haver AQ0MP0
honoris causa LOC_ADV
in extremis LOC_ADV
+in extenso LOC_ADV
+in fraganti LOC_ADV
+in vitro LOC_ADV
+in situ LOC_ADV
+ipso facto LOC_ADV
junt amb LOC_PREP
juntament amb LOC_PREP
llevat d' LOC_PREP
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml	2013-03-15 11:26:25 UTC (rev 9692)
@@ -6097,7 +6097,7 @@
<pattern>
<marker>
<token>de</token>
- <token regexp="yes">1|11</token>
+ <token regexp="yes">(1|11)%?</token>
</marker>
<token><exception postag="decimal_comma"/></token>
</pattern>
@@ -6113,7 +6113,7 @@
<pattern>
<marker>
<token>el</token>
- <token regexp="yes">1|11</token>
+ <token regexp="yes">(1|11)%?</token>
</marker>
<token><exception postag="decimal_comma"/></token>
</pattern>
@@ -6122,6 +6122,7 @@
<short>Error ortogràfic</short>
<example type="incorrect" correction="l'1"><marker>el 1</marker> de febrer</example>
<example type="incorrect" correction="l'1"><marker>el 1</marker>, el 2 i el 3 de febrer</example>
+ <example type="incorrect" correction="L'11%"><marker>El 11%</marker> dels assistents.</example>
<example type="correct">el 1.400</example>
</rule>
<rule>
@@ -6129,7 +6130,7 @@
<marker>
<token regexp="yes">a|de</token>
<token spacebefore="no">l</token>
- <token regexp="yes">1|11</token>
+ <token regexp="yes">(1|11)%?</token>
</marker>
<token><exception postag="decimal_comma"/></token>
</pattern>
@@ -6145,7 +6146,7 @@
<marker>
<token>pe</token>
<token spacebefore="no">l</token>
- <token regexp="yes">1|11</token>
+ <token regexp="yes">(1|11)%?</token>
</marker>
<token><exception postag="decimal_comma"/></token>
</pattern>
@@ -9207,6 +9208,14 @@
<example type="correct">Lo Rat Penat</example>
<example type="correct">Lo Rat-Penat</example>
</rule>
+ <rule id="LOS" name="los">
+ <pattern>
+ <token>los</token>
+ </pattern>
+ <message>¿Volíeu dir <suggestion>els</suggestion>?</message>
+ <example type="incorrect" correction="els">Són <marker>los</marker> millors.</example>
+ <example type="correct">Són els millors.</example>
+ </rule>
</rulegroup>
</category>
<category name="D1) Revisions opcionals">
@@ -10754,10 +10763,10 @@
<suggestion><match no="1" postag="V.([ISNGP].*)" postag_regexp="yes" postag_replace="VM$1">anar</match></suggestion>
<suggestion><match no="1" postag="V.([ISNGP].*)" postag_regexp="yes" postag_replace="VS$1">ser</match></suggestion>
<short>Expressió incorrecta.</short>
- <example type="incorrect" correction="aniran|seran">Les despeses <marker>correran</marker> a càrrec meu.</example>
+ <example type="incorrect" correction="aniran|iran|seran">Les despeses <marker>correran</marker> a càrrec meu.</example>
<example type="correct">Les despeses seran a càrrec meu.</example>
</rule>
- <rule id="ENMIG" name="*en mig de / enmig de">
+ <!-- <rule id="ENMIG" name="*en mig de / enmig de">
<pattern>
<marker>
<token>en</token>
@@ -10771,7 +10780,7 @@
<example type="correct">En mig de les timonedes.</example>
<example type="correct">en mig país</example>
<example type="correct">Enmig de les timonedes.</example>
- </rule>
+ </rule> -->
<rule id="CASC_ANTIC" name="casc antic">
<pattern>
<token inflected="yes">casc</token>
Modified:
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
===================================================================
--- trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java	2013-03-14 23:41:04 UTC (rev 9691)
+++ trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java	2013-03-15 11:26:25 UTC (rev 9692)
@@ -68,6 +68,13 @@
//incorrect sentences:
+ matches = rule.match(langTool.getAnalyzedSentence("joan"));
+ // check match positions:
+ assertEquals(1, matches.length);
+ assertEquals(0, matches[0].getFromPos());
+ assertEquals(4, matches[0].getToPos());
+ assertEquals("Joan", matches[0].getSuggestedReplacements().get(0));
+
matches = rule.match(langTool.getAnalyzedSentence("abatusats"));
// check match positions:
assertEquals(1, matches.length);
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_d2d_mar
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits