[LanguageTool] SF.net SVN: languagetool:[6001] trunk/JLanguageTool/src

dnaber Thu, 01 Dec 2011 14:40:58 -0800

Revision: 6001
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=6001&view=rev
Author:   dnaber
Date:     2011-12-01 22:40:48 +0000 (Thu, 01 Dec 2011)
Log Message:
-----------
[de] remove manual list of inflected words, now that our version of the Morphy 
dictionary is more complete with respect to the German spelling reform


Modified Paths:
--------------
    
trunk/JLanguageTool/src/java/org/languagetool/rules/de/WordCoherencyRule.java
    trunk/JLanguageTool/src/resource/de/added.txt
    
trunk/JLanguageTool/src/test/org/languagetool/rules/de/WordCoherencyRuleTest.java

Removed Paths:
-------------
    trunk/JLanguageTool/src/java/org/languagetool/rules/de/GermanLemmatizer.java
    trunk/JLanguageTool/src/rules/de/fullform2baseform.txt

Deleted: 
trunk/JLanguageTool/src/java/org/languagetool/rules/de/GermanLemmatizer.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/rules/de/GermanLemmatizer.java    
    2011-11-30 16:39:13 UTC (rev 6000)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/rules/de/GermanLemmatizer.java    
    2011-12-01 22:40:48 UTC (rev 6001)
@@ -1,84 +0,0 @@
-/* LanguageTool, a natural language style checker 
- * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
- * 
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
- * USA
- */
-package org.languagetool.rules.de;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.languagetool.JLanguageTool;
-
-/**
- * Trivial German lemmatizer that can simply find the baseforms of
- * those fullforms listed in <code>rules/de/fullform2baseform.txt</code>.
- * 
- * @author Daniel Naber
- */
-class GermanLemmatizer {
-
-  private static final String FILE_NAME = "/de/fullform2baseform.txt";
-  private static final String FILE_ENCODING = "utf-8";
-  
-  private final Map<String, String> fullform2baseform;
-  
-  GermanLemmatizer() throws IOException {
-    fullform2baseform = 
loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME));
-  }
-  
-  String getBaseform(final String fullform) {
-    return fullform2baseform.get(fullform);
-  }
-  
-  private Map<String, String> loadWords(InputStream file) throws IOException {
-    final Map<String, String> map = new HashMap<String, String>();
-    InputStreamReader isr = null;
-    BufferedReader br = null;
-    try {
-      isr = new InputStreamReader(file, FILE_ENCODING);
-      br = new BufferedReader(isr);
-      String line;
-      while ((line = br.readLine()) != null) {
-        line = line.trim();
-        if (line.length() < 1) { //ignore empty lines
-          continue;
-        }
-        if (line.charAt(0) == '#') {      // ignore comments
-          continue;
-        }
-        final String[] parts = line.split(":");
-        if (parts.length != 2) {
-          throw new IOException("Format error in file " 
+JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME)+", line: " + 
line);
-        }
-        final String baseform = parts[0];
-        final String[] fullforms = parts[1].split(",");
-        for (String fullform : fullforms) {
-          map.put(fullform.trim(), baseform);
-        }
-      }
-    } finally {
-      if (br != null) br.close();
-      if (isr != null) isr.close();
-    }
-    return map;
-  }
-
-}

Modified: 
trunk/JLanguageTool/src/java/org/languagetool/rules/de/WordCoherencyRule.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/rules/de/WordCoherencyRule.java   
    2011-11-30 16:39:13 UTC (rev 6000)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/rules/de/WordCoherencyRule.java   
    2011-12-01 22:40:48 UTC (rev 6001)
@@ -54,13 +54,11 @@
   private final Map<String, String> relevantWords;        // e.g. "aufwendig 
-> aufwändig"
   private Map<String, RuleMatch> shouldNotAppearWord = new HashMap<String, 
RuleMatch>();  // e.g. aufwändig -> RuleMatch of aufwendig
 
-  private final GermanLemmatizer germanLemmatizer;
-  
   public WordCoherencyRule(ResourceBundle messages) throws IOException {
-    if (messages != null)
+    if (messages != null) {
       super.setCategory(new Category(messages.getString("category_misc")));
+    }
     relevantWords = 
loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME)); 
-    germanLemmatizer = new GermanLemmatizer();
   }
   
   @Override
@@ -89,17 +87,11 @@
       } else {
         final String origToken = token;
         final List<AnalyzedToken> readings = tmpToken.getReadings();
-        // TODO: in theory we need to care about the other readings, too:
+        // TODO: in theory we need to care about the other readings, too 
(affects e.g. German "Schenke" as a noun):
         if (readings != null && readings.size() > 0) {
           final String baseform = readings.get(0).getLemma();
           if (baseform != null) {
             token = baseform;
-          } else {
-            // not all words are known by the Tagger (esp. compounds), so use 
the
-            // file lookup:
-            final String manualLookup = 
germanLemmatizer.getBaseform(origToken);
-            if (manualLookup != null)
-              token = manualLookup;
           }
         }
         if (shouldNotAppearWord.containsKey(token)) {

Modified: trunk/JLanguageTool/src/resource/de/added.txt
===================================================================
--- trunk/JLanguageTool/src/resource/de/added.txt       2011-11-30 16:39:13 UTC 
(rev 6000)
+++ trunk/JLanguageTool/src/resource/de/added.txt       2011-12-01 22:40:48 UTC 
(rev 6001)
@@ -299,3 +299,67 @@
 Protonen       Proton  SUB:GEN:PLU:NEU
 Protonen       Proton  SUB:DAT:PLU:NEU
 Protonen       Proton  SUB:AKK:PLU:NEU
+Xylophon       Xylophon        SUB:NOM:SIN:NEU
+Xylophons      Xylophon        SUB:GEN:SIN:NEU
+Xylophon       Xylophon        SUB:DAT:SIN:NEU
+Xylophon       Xylophon        SUB:AKK:SIN:NEU
+Xylophone      Xylophon        SUB:NOM:PLU:NEU
+Xylophone      Xylophon        SUB:GEN:PLU:NEU
+Xylophonen     Xylophon        SUB:DAT:PLU:NEU
+Xylophone      Xylophon        SUB:AKK:PLU:NEU
+Xylofon        Xylofon SUB:NOM:SIN:NEU
+Xylofons       Xylofon SUB:GEN:SIN:NEU
+Xylofon        Xylofon SUB:DAT:SIN:NEU
+Xylofon        Xylofon SUB:AKK:SIN:NEU
+Xylofone       Xylofon SUB:NOM:PLU:NEU
+Xylofone       Xylofon SUB:GEN:PLU:NEU
+Xylofonen      Xylofon SUB:DAT:PLU:NEU
+Xylofone       Xylofon SUB:AKK:PLU:NEU
+Schenke        Schenke SUB:NOM:SIN:FEM
+Schenke        Schenke SUB:GEN:SIN:FEM
+Schenke        Schenke SUB:DAT:SIN:FEM
+Schenke        Schenke SUB:AKK:SIN:FEM
+Schenken       Schenke SUB:NOM:PLU:FEM
+Schenken       Schenke SUB:GEN:PLU:FEM
+Schenken       Schenke SUB:DAT:PLU:FEM
+Schenken       Schenke SUB:AKK:PLU:FEM
+Potenzial      Potenzial       SUB:NOM:SIN:NEU
+Potenzials     Potenzial       SUB:GEN:SIN:NEU
+Potenzial      Potenzial       SUB:DAT:SIN:NEU
+Potenzial      Potenzial       SUB:AKK:SIN:NEU
+Potenziale     Potenzial       SUB:NOM:PLU:NEU
+Potenziale     Potenzial       SUB:GEN:PLU:NEU
+Potenzialen    Potenzial       SUB:DAT:PLU:NEU
+Potenziale     Potenzial       SUB:AKK:PLU:NEU
+Nessessär      Nessessär       SUB:NOM:SIN:NEU
+Nessessärs     Nessessär       SUB:GEN:SIN:NEU
+Nessessär      Nessessär       SUB:DAT:SIN:NEU
+Nessessär      Nessessär       SUB:AKK:SIN:NEU
+Nessessäres    Nessessär       SUB:NOM:PLU:NEU
+Nessessäres    Nessessär       SUB:GEN:PLU:NEU
+Nessessäres    Nessessär       SUB:DAT:PLU:NEU
+Nessessäres    Nessessär       SUB:AKK:PLU:NEU
+Necessaire     Necessaire      SUB:NOM:SIN:NEU
+Necessaires    Necessaire      SUB:GEN:SIN:NEU
+Necessaire     Necessaire      SUB:DAT:SIN:NEU
+Necessaire     Necessaire      SUB:AKK:SIN:NEU
+Necessaires    Necessaire      SUB:NOM:PLU:NEU
+Necessaires    Necessaire      SUB:GEN:PLU:NEU
+Necessaires    Necessaire      SUB:DAT:PLU:NEU
+Necessaires    Necessaire      SUB:AKK:PLU:NEU
+Kommuniqué     Kommuniqué      SUB:NOM:SIN:NEU
+Kommuniqués    Kommuniqué      SUB:GEN:SIN:NEU
+Kommuniqué     Kommuniqué      SUB:DAT:SIN:NEU
+Kommuniqué     Kommuniqué      SUB:AKK:SIN:NEU
+Kommuniqués    Kommuniqué      SUB:NOM:PLU:NEU
+Kommuniqués    Kommuniqué      SUB:GEN:PLU:NEU
+Kommuniqués    Kommuniqué      SUB:DAT:PLU:NEU
+Kommuniqués    Kommuniqué      SUB:AKK:PLU:NEU
+Facette        Facette SUB:NOM:SIN:FEM
+Facette        Facette SUB:GEN:SIN:FEM
+Facette        Facette SUB:DAT:SIN:FEM
+Facette        Facette SUB:AKK:SIN:FEM
+Facetten       Facette SUB:NOM:PLU:FEM
+Facetten       Facette SUB:GEN:PLU:FEM
+Facetten       Facette SUB:DAT:PLU:FEM
+Facetten       Facette SUB:AKK:PLU:FEM

Deleted: trunk/JLanguageTool/src/rules/de/fullform2baseform.txt
===================================================================
--- trunk/JLanguageTool/src/rules/de/fullform2baseform.txt      2011-11-30 
16:39:13 UTC (rev 6000)
+++ trunk/JLanguageTool/src/rules/de/fullform2baseform.txt      2011-12-01 
22:40:48 UTC (rev 6001)
@@ -1,40 +0,0 @@
-# Map fullforms to baseforms, e.g. gehen:gehe,gehst,ging,...
-# This file must be encoded in UTF-8.
-aufwendig:aufwendige, aufwendiges, aufwendiger,aufwendigen,aufwendigem, 
aufwendigere, aufwendigeres, aufwendigeren, aufwendigerem, aufwendigste, 
aufwendigstes, aufwendigster, aufwendigsten, aufwendigstem
-aufwändig:aufwändige, aufwändiges, aufwändiger, aufwändigen, aufwändigem, 
aufwändigere, aufwändigeres, aufwändigeren, aufwändigerem, aufwändigste, 
aufwändigstes, aufwändigster, aufwändigsten, aufwändigstem
-Delfin:Delfine, Delfins, Delfines, Delfinen
-Dephfin:Dephfine, Delphins, Delphines, Delphfinen
-essentiell:essentielle, essentiellem, essentiellen, essentieller, 
essentiellere, essentiellerem, essentielleren, essentiellerer, essentielleres, 
essentielles, essentiellst, essentiellste, essentiellstem, essentiellsten, 
essentiellstens, essentiellster, essentiellstes
-essenziell:essenzielle, essenziellem, essenziellen, essenzieller, 
essenziellere, essenziellerem, essenzielleren, essenziellerer, essenzielleres, 
essenzielles, essenziellst, essenziellste, essenziellstem, essenziellsten, 
essenziellstens, essenziellster, essenziellstes
-Differential:Differentiale, Differentialen, Differentials
-Differenzial:Differenziale, Differenzialen, Differenziales, Differenzials
-Facette:Facetten
-Fassette:Fassetten
-Joghurt:Joghurts
-Jogurt:Jogurts
-Ketchup:Ketchups
-Ketschup:Ketschups
-Kommuniqué:Kommuniqués
-Kommunikee:Kommunikee
-Necessaire:Necessaires
-Nessessär:Nessessärs
-Orthographie:Orthographien
-Ortografie:Ortografien
-Potential:Potentiale, Potentials, Potentiales, Potentialen
-Potenzial:Potenziale, Potenzials, Potenziales, Potenzialen
-Portemonnaie:Portemonnaies
-Portmonee:Portmonees
-potentiell:potentielle, potentiellem, potentiellen, potentieller, potentielles
-potenziell:potenzielle, potenziellem, potenziellen, potenzieller, potenzielles
-Schenke:Schenken
-Schänke:Schänken
-substantiell:substantielle, substantiellem, substantiellen, substantieller, 
substantiellere, substantiellerem, substantielleren, substantiellerer, 
substantielleres, substantielles, substantiellst, substantiellste, 
substantiellstem, substantiellsten, substantiellstens, substantiellster, 
substantiellstes
-substanziell:substanzielle, substanziellem, substanziellen, substanzieller, 
substanziellere, substanziellerem, substanzielleren, substanziellerer, 
substanzielleres, substanzielles, substanziellst, substanziellste, 
substanziellstem, substanziellsten, substanziellstens, substanziellster, 
substanziellstes
-Thunfisch:Thunfische, Thunfischen, Thunfisches, Thunfischs
-Tunfisch:Tunfische, Tunfischen, Tunfisches, Tunfischs
-Xylophon:Xylofone, Xylofonen, Xylofons
-Xylofon:Xylophone, Xylophonen, Xylophons
-selbständig:selbständige, selbständiger, selbständiges, selbständigen, 
selbständigem, selbständigeres
-selbstständig:selbstständige, selbstständiger, selbstständiges, 
selbstständigen, selbstständigem, selbstständigeres
-Bahnhofsplatz:Bahnhofsplatzes, Bahnhofsplätze, Bahnhofsplätzen
-Bahnhofplatz:Bahnhofplatzes, Bahnhofplätze, Bahnhofplätzen

Modified: 
trunk/JLanguageTool/src/test/org/languagetool/rules/de/WordCoherencyRuleTest.java
===================================================================
--- 
trunk/JLanguageTool/src/test/org/languagetool/rules/de/WordCoherencyRuleTest.java
   2011-11-30 16:39:13 UTC (rev 6000)
+++ 
trunk/JLanguageTool/src/test/org/languagetool/rules/de/WordCoherencyRuleTest.java
   2011-12-01 22:40:48 UTC (rev 6001)
@@ -41,18 +41,61 @@
     rule.reset();
     assertEquals(0, rule.match(langTool.getAnalyzedSentence("Das ist 
aufwändig, aber nicht zu aufwändig.")).length);
     // errors:
-    rule.reset();
-    assertEquals(1, rule.match(langTool.getAnalyzedSentence("Das ist 
aufwendig, aber nicht zu aufwändig.")).length);
-    rule.reset();
-    assertEquals(1, rule.match(langTool.getAnalyzedSentence("Das ist 
aufwändig, aber nicht zu aufwendig.")).length);
+    assertError("Das ist aufwendig, aber nicht zu aufwändig.", langTool);
+    assertError("Das ist aufwendiger, aber nicht zu aufwändig.", langTool);
+    assertError("Das ist aufwändig, aber nicht zu aufwendig.", langTool);
+    assertError("Das ist aufwändiger, aber nicht zu aufwendig.", langTool);
+    assertError("Delfin und Delphin", langTool);
+    assertError("Delfins und Delphine", langTool);
+    assertError("essentiell und essenziell", langTool);
+    assertError("essentieller und essenzielles", langTool);
+    assertError("Differential und Differenzial", langTool);
+    assertError("Differentials und Differenzials", langTool);
+    assertError("Facette und Fassette", langTool);
+    assertError("Facetten und Fassetten", langTool);
+    assertError("Joghurt und Jogurt", langTool);
+    assertError("Joghurts und Jogurt", langTool);
+    assertError("Joghurt und Jogurts", langTool);
+    assertError("Joghurts und Jogurts", langTool);
+    assertError("Ketchup und Ketschup", langTool);
+    assertError("Ketchups und Ketschups", langTool);
+    assertError("Kommuniqué und Kommunikee", langTool);
+    assertError("Kommuniqués und Kommunikees", langTool);
+    assertError("Necessaire und Nessessär", langTool);
+    assertError("Necessaires und Nessessärs", langTool);
+    assertError("Orthographie und Orthografie", langTool);
+    assertError("Orthographien und Orthografien", langTool);
+    assertError("Potential und Potenzial", langTool);
+    assertError("Potentials und Potenziale", langTool);
+    assertError("Portemonnaie und Portmonee", langTool);
+    assertError("Portemonnaies und Portmonees", langTool);
+    assertError("potentiell und potenziell", langTool);
+    assertError("potentielles und potenzieller", langTool);
+    assertError("Schenke und Schänke", langTool);
+    // see TODO comment in WordCoherencyRule:
+    //assertError("Schenken und Schänken", langTool);
+    assertError("substantiell und substanziell", langTool);
+    assertError("substantieller und substanzielles", langTool);
+    assertError("Thunfisch und Tunfisch", langTool);
+    assertError("Thunfische und Tunfische", langTool);
+    assertError("Xylophon und Xylofon", langTool);
+    assertError("Xylophone und Xylofone", langTool);
+    assertError("selbständig und selbstständig", langTool);
+    assertError("selbständiges und selbstständiger", langTool);
+    assertError("Bahnhofsplatz und Bahnhofplatz", langTool);
+    // TODO: known to fail because jWordSplitter's list is not complete:
+    //assertError("Testketchup und Testketschup", langTool);
   }
-  
+
+  private void assertError(String s, JLanguageTool langTool) throws 
IOException {
+    final WordCoherencyRule rule = new WordCoherencyRule(null);
+    assertEquals(1, rule.match(langTool.getAnalyzedSentence(s)).length);
+  }
+
   public void testRuleCompleteTexts() throws IOException {
     final JLanguageTool langTool;
     // complete texts:
     List<RuleMatch> matches;
-    //matches = langTool.check("Das ist aufwendig. Aber hallo. Es ist wirklich 
aufwendig.");
-    //assertEquals(0, matches.size());
     langTool = new JLanguageTool(Language.GERMAN);
     matches = langTool.check("Das ist aufwändig. Aber hallo. Es ist wirklich 
aufwändig.");
     assertEquals(0, matches.size());

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
All the data continuously generated in your IT infrastructure 
contains a definitive record of customers, application performance, 
security threats, fraudulent activity, and more. Splunk takes this 
data and makes sense of it. IT sense. And common sense.
http://p.sf.net/sfu/splunk-novd2d
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

[LanguageTool] SF.net SVN: languagetool:[6001] trunk/JLanguageTool/src

Reply via email to