Revision: 9692
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=9692&view=rev
Author:   jaumeortola
Date:     2013-03-15 11:26:25 +0000 (Fri, 15 Mar 2013)
Log Message:
-----------
[ca] Avoid tagging lowercase words with tags for uppercase words. 
Fixes in tagger dictionary. 
New multiwords.

Modified Paths:
--------------
    
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
    
trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java
    
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan.dict
    
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_synth.dict
    
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt
    
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
    
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt
    
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml
    
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java

Modified: 
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
===================================================================
--- 
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
 2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-core/src/main/java/org/languagetool/tagging/BaseTagger.java
 2013-03-15 11:26:25 UTC (rev 9692)
@@ -43,6 +43,7 @@
 
   private IStemmer dictLookup;
   private Locale conversionLocale = Locale.getDefault();  
+  boolean tagLowercaseWithUppercase = true;
 
   /**
    * Get the filename, e.g., <tt>/resource/fr/french.dict</tt>.
@@ -84,7 +85,7 @@
 
       //uppercase
       if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
-        if (isLowercase) {          
+        if (tagLowercaseWithUppercase && isLowercase) {          
           upperTaggerTokens = asAnalyzedTokenList(word, 
               dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
           if (!upperTaggerTokens.isEmpty()) {
@@ -150,5 +151,9 @@
   public AnalyzedToken createToken(String token, String posTag) {
     return new AnalyzedToken(token, posTag, null);
   }
+  
+  public void dontTagLowercaseWithUppercase() {
+    tagLowercaseWithUppercase=false;
+  }
 
 }

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java
===================================================================
--- 
trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java
    2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/ca/CatalanTagger.java
    2013-03-15 11:26:25 UTC (rev 9692)
@@ -20,6 +20,7 @@
 
 import java.io.IOException;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
@@ -28,8 +29,11 @@
 import morfologik.stemming.IStemmer;
 import morfologik.stemming.WordData;
 
+import org.languagetool.AnalyzedToken;
+import org.languagetool.AnalyzedTokenReadings;
 import org.languagetool.JLanguageTool;
 import org.languagetool.tagging.BaseTagger;
+import org.languagetool.tools.StringTools;
 
 /**
  * Catalan Tagger
@@ -52,6 +56,7 @@
        public CatalanTagger() {
                super();
                setLocale(new Locale("ca"));
+               this.dontTagLowercaseWithUppercase();
        }
 
        public boolean existsWord(String word) throws IOException {
@@ -71,4 +76,8 @@
                }
                return true;
        }
+       
+       
+       
+       
 }

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan.dict
===================================================================
(Binary files differ)

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_synth.dict
===================================================================
(Binary files differ)

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt
===================================================================
--- 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt
        2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/catalan_tags.txt
        2013-03-15 11:26:25 UTC (rev 9692)
@@ -97,6 +97,7 @@
 NPFSO00
 NPFSSP0
 NPMNG00
+NPMNSP0
 NPMPG00
 NPMPO00
 NPMPSP0

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
===================================================================
--- 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
      2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/disambiguation.xml
      2013-03-15 11:26:25 UTC (rev 9692)
@@ -737,28 +737,6 @@
             </pattern>
             <disambig action="unify"/>
         </rule>
-    </rulegroup> 
-    <rulegroup id="lo_admissible" name="Tirant lo Blanc">
-        <rule>
-            <pattern case_sensitive="yes">
-                <token>Tirant</token>
-                <marker>
-                    <token>lo</token>
-                </marker>
-                <token>Blanc</token>
-            </pattern>
-            <disambig action="immunize"/>
-        </rule>
-        <rule>
-            <pattern case_sensitive="yes">
-                <marker>
-                    <token>Lo</token>
-                </marker>
-                <token skip="1">Rat</token>
-                <token>Penat</token>
-            </pattern>
-            <disambig action="immunize"/>
-        </rule>
     </rulegroup>    
     <rulegroup id="interjeccions" name="interjeccions">
         <rule>
@@ -8395,4 +8373,26 @@
             </pattern>
             <disambig action="add"><wd pos="_possible_nompropi"/></disambig>
      </rule>
-</rules>
\ No newline at end of file
+     <rulegroup id="lo_admissible" name="Tirant lo Blanc">
+        <rule>
+            <pattern case_sensitive="yes">
+                <token>Tirant</token>
+                <marker>
+                    <token>lo</token>
+                </marker>
+                <token>Blanc</token>
+            </pattern>
+            <disambig action="immunize"/>
+        </rule>
+        <rule>
+            <pattern case_sensitive="yes">
+                <marker>
+                    <token>Lo</token>
+                </marker>
+                <token skip="1">Rat</token>
+                <token>Penat</token>
+            </pattern>
+            <disambig action="immunize"/>
+        </rule>
+    </rulegroup>
+</rules>

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt
===================================================================
--- 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt
  2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/resource/ca/multiwords.txt
  2013-03-15 11:26:25 UTC (rev 9692)
@@ -55,6 +55,7 @@
 A diferència de        LOC_PREP
 A dojo LOC_ADV
 A dretcient    LOC_ADV
+A esclatacor   LOC_ADV
 A excepció d'  LOC_PREP
 A excepció de  LOC_PREP
 A falta d'     LOC_PREP
@@ -63,6 +64,10 @@
 A favor d'     LOC_PREP
 A força d'     LOC_PREP
 A força de     LOC_PREP
+A forfollons   LOC_ADV
+A gatamèus     LOC_ADV
+A granel       LOC_ADV
+A gratcient    LOC_ADV
 A guisa d'     LOC_PREP
 A guisa de     LOC_PREP
 A l caient d'  LOC_PREP
@@ -131,6 +136,8 @@
 A la doba      LOC_ADJ
 A la llarga    LOC_ADV
 A la claror de LOC_PREP
+A la funerala  LOC_ADV
+A la gatzoneta LOC_ADV
 A la mercè d'  LOC_PREP
 A la mercè de  LOC_PREP
 A la recerca d'        LOC_PREP
@@ -140,6 +147,8 @@
 A la vora de   LOC_PREP
 A les envistes d'      LOC_PREP
 A les envistes de      LOC_PREP
+A les hurtes   LOC_ADV
+A lloure       LOC_ADV
 A manera d'    LOC_PREP
 A manera de    LOC_PREP
 A mercè d'     LOC_PREP
@@ -181,6 +190,7 @@
 A última hora  LOC_ADV
 Abans d'       LOC_PREP
 Abans de       LOC_PREP
+Ad hominem     LOC_ADJ
 Així mateix    LOC_CONJ
 Això no obstant        LOC_ADV
 Al capdavall   LOC_ADV
@@ -241,6 +251,7 @@
 D' esquerres   LOC_ADV
 D' esquitllada LOC_ADV
 D' esquitllentes       LOC_ADV
+D' esquitllèbit        LOC_ADV
 D' estar per casa      LOC_ADV
 D' estira i arronsa    LOC_ADV
 D' estranquis  LOC_ADV
@@ -279,6 +290,11 @@
 De debò        LOC_ADV
 De dretes      LOC_ADV
 De facto       LOC_ADV
+De folondres   LOC_ADV
+De futris      LOC_ADV
+De gaidó       LOC_ADV
+De gairell     LOC_ADV
+De genollons   LOC_ADV
 De gom a gom   LOC_ADV
 De iure        LOC_ADV
 De l tot       LOC_ADV
@@ -433,13 +449,20 @@
 Fins ara       LOC_ADV
 Fins avui      LOC_ADV
 Fins i tot     LOC_ADV
+Foie gras      NCMS000
 Front a front  LOC_ADV
 Futbol sala    NCMS000
 Gens ni mica   LOC_ADV
 Gran Bretanya  NPFSG00
 Gràcies a      LOC_PREP
+Grosso modo    LOC_ADV
 Honoris causa  LOC_ADV
 In extremis    LOC_ADV
+In extenso     LOC_ADV
+In fraganti    LOC_ADV
+In vitro       LOC_ADV
+In situ        LOC_ADV
+Ipso facto     LOC_ADV
 Junt amb       LOC_PREP
 Juntament amb  LOC_PREP
 Llevat d'      LOC_PREP
@@ -599,6 +622,7 @@
 a diferència de        LOC_PREP
 a dojo LOC_ADV
 a dretcient    LOC_ADV
+a esclatacor   LOC_ADV
 a excepció d'  LOC_PREP
 a excepció de  LOC_PREP
 a falta d'     LOC_PREP
@@ -607,6 +631,10 @@
 a favor d'     LOC_PREP
 a força d'     LOC_PREP
 a força de     LOC_PREP
+a forfollons   LOC_ADV
+a gatamèus     LOC_ADV
+a granel       LOC_ADV
+a gratcient    LOC_ADV
 a guisa d'     LOC_PREP
 a guisa de     LOC_PREP
 a l caient d'  LOC_PREP
@@ -673,6 +701,8 @@
 a la biorxa    LOC_ADV
 a la claror d' LOC_PREP
 a la claror de LOC_PREP
+a la funerala  LOC_ADV
+a la gatzoneta LOC_ADV
 a la doba      LOC_ADJ
 a la llarga    LOC_ADV
 a la mercè d'  LOC_PREP
@@ -684,6 +714,8 @@
 a la vora de   LOC_PREP
 a les envistes d'      LOC_PREP
 a les envistes de      LOC_PREP
+a les hurtes   LOC_ADV
+a lloure       LOC_ADV
 a manera d'    LOC_PREP
 a manera de    LOC_PREP
 a mercè d'     LOC_PREP
@@ -725,6 +757,7 @@
 a última hora  LOC_ADV
 abans d'       LOC_PREP
 abans de       LOC_PREP
+ad hominem     LOC_ADJ
 així mateix    LOC_CONJ
 això no obstant        LOC_ADV
 al capdavall   LOC_ADV
@@ -784,6 +817,7 @@
 d' esquerres   LOC_ADV
 d' esquitllada LOC_ADV
 d' esquitllentes       LOC_ADV
+d' esquitllèbit        LOC_ADV
 d' estar per casa      LOC_ADV
 d' estira i arronsa    LOC_ADV
 d' estranquis  LOC_ADV
@@ -823,6 +857,11 @@
 de dretes      LOC_ADV
 de facto       LOC_ADV
 de fiar        LOC_ADJ
+de folondres   LOC_ADV
+de futris      LOC_ADV
+de gaidó       LOC_ADV
+de gairell     LOC_ADV
+de genollons   LOC_ADV
 de gom a gom   LOC_ADV
 de iure        LOC_ADV
 de l tot       LOC_ADV
@@ -979,14 +1018,21 @@
 fins ara       LOC_ADV
 fins avui      LOC_ADV
 fins i tot     LOC_ADV
+foie gras      NCMS000
 front a front  LOC_ADV
 futbol sala    NCMS000
 gens ni mica   LOC_ADV
 gràcies a      LOC_PREP
+grosso modo    LOC_ADV
 hagudes i per haver    AQ0FP0
 haguts i per haver     AQ0MP0
 honoris causa  LOC_ADV
 in extremis    LOC_ADV
+in extenso     LOC_ADV
+in fraganti    LOC_ADV
+in vitro       LOC_ADV
+in situ        LOC_ADV
+ipso facto     LOC_ADV
 junt amb       LOC_PREP
 juntament amb  LOC_PREP
 llevat d'      LOC_PREP

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml
===================================================================
--- 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml
        2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-language-modules/ca/src/main/resources/org/languagetool/rules/ca/grammar.xml
        2013-03-15 11:26:25 UTC (rev 9692)
@@ -6097,7 +6097,7 @@
                 <pattern>
                     <marker>
                         <token>de</token>
-                        <token regexp="yes">1|11</token>
+                        <token regexp="yes">(1|11)%?</token>
                     </marker>
                     <token><exception postag="decimal_comma"/></token>
                 </pattern>
@@ -6113,7 +6113,7 @@
                 <pattern>
                     <marker>
                         <token>el</token>
-                        <token regexp="yes">1|11</token>
+                        <token regexp="yes">(1|11)%?</token>
                     </marker>
                     <token><exception postag="decimal_comma"/></token>
                 </pattern>
@@ -6122,6 +6122,7 @@
                 <short>Error ortogràfic</short>
                 <example type="incorrect" correction="l'1"><marker>el 1</marker> de febrer</example>
                 <example type="incorrect" correction="l'1"><marker>el 1</marker>, el 2 i el 3 de febrer</example>
+                <example type="incorrect" correction="L'11%"><marker>El 11%</marker> dels assistents.</example>
                 <example type="correct">el 1.400</example>
             </rule>
             <rule>
@@ -6129,7 +6130,7 @@
                     <marker>
                         <token regexp="yes">a|de</token>
                         <token spacebefore="no">l</token>
-                        <token regexp="yes">1|11</token>
+                        <token regexp="yes">(1|11)%?</token>
                     </marker>
                     <token><exception postag="decimal_comma"/></token>
                 </pattern>
@@ -6145,7 +6146,7 @@
                     <marker>
                         <token>pe</token>
                         <token spacebefore="no">l</token>
-                        <token regexp="yes">1|11</token>
+                        <token regexp="yes">(1|11)%?</token>
                     </marker>
                     <token><exception postag="decimal_comma"/></token>
                 </pattern>
@@ -9207,6 +9208,14 @@
                 <example type="correct">Lo Rat Penat</example>
                 <example type="correct">Lo Rat-Penat</example>
             </rule>
+            <rule id="LOS" name="los">
+               <pattern>
+                       <token>los</token>
+               </pattern>
+               <message>¿Volíeu dir <suggestion>els</suggestion>?</message>
+               <example type="incorrect" correction="els">Són <marker>los</marker> millors.</example>
+               <example type="correct">Són els millors.</example>
+            </rule>
         </rulegroup>
     </category>
     <category name="D1) Revisions opcionals">
@@ -10754,10 +10763,10 @@
             <suggestion><match no="1" postag="V.([ISNGP].*)" postag_regexp="yes" postag_replace="VM$1">anar</match></suggestion>
             <suggestion><match no="1" postag="V.([ISNGP].*)" postag_regexp="yes" postag_replace="VS$1">ser</match></suggestion>
             <short>Expressió incorrecta.</short>
-            <example type="incorrect" correction="aniran|seran">Les despeses <marker>correran</marker> a càrrec meu.</example>
+            <example type="incorrect" correction="aniran|iran|seran">Les despeses <marker>correran</marker> a càrrec meu.</example>
             <example type="correct">Les despeses seran a càrrec meu.</example>
         </rule>
-        <rule id="ENMIG" name="*en mig de / enmig de">
+        <!-- <rule id="ENMIG" name="*en mig de / enmig de">
             <pattern>
                 <marker>
                     <token>en</token>
@@ -10771,7 +10780,7 @@
             <example type="correct">En mig de les timonedes.</example>
             <example type="correct">en mig país</example>
             <example type="correct">Enmig de les timonedes.</example>
-        </rule>
+        </rule> -->
         <rule id="CASC_ANTIC" name="casc antic">
             <pattern>
                 <token inflected="yes">casc</token>

Modified: 
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
===================================================================
--- 
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
   2013-03-14 23:41:04 UTC (rev 9691)
+++ 
trunk/languagetool/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/MorfologikCatalanSpellerRuleTest.java
   2013-03-15 11:26:25 UTC (rev 9692)
@@ -68,6 +68,13 @@
 
         //incorrect sentences:
 
+        matches = rule.match(langTool.getAnalyzedSentence("joan"));
+        // check match positions:
+        assertEquals(1, matches.length);
+        assertEquals(0, matches[0].getFromPos());
+        assertEquals(4, matches[0].getToPos());
+        assertEquals("Joan", matches[0].getSuggestedReplacements().get(0));
+        
         matches = rule.match(langTool.getAnalyzedSentence("abatusats"));
         // check match positions:
         assertEquals(1, matches.length);

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_d2d_mar
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits

Reply via email to