Revision: 6170
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=6170&view=rev
Author:   archeus
Date:     2012-01-06 12:19:18 +0000 (Fri, 06 Jan 2012)
Log Message:
-----------
Added a "synthesizer" that takes its information from a plain text file so the 
*.dict doesn't need to be recompiled for every dictionary change (like 
ManualTagger)

Modified Paths:
--------------
    trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java

Added Paths:
-----------
    
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java
    
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java

Added: 
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java  
                            (rev 0)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java  
    2012-01-06 12:19:18 UTC (rev 6170)
@@ -0,0 +1,99 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.synthesis;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.languagetool.tagging.ManualTagger;
+import org.languagetool.tools.StringTools;
+
+/**
+ * A synthesizer that reads the inflected form and POS information from a 
plain (UTF-8) text file. <br/>
+ * This makes it possible for the user to edit the text file to let the system 
know
+ * about new words or missing readings in the synthesizer *.dict file.
+ * <p>
+ * File Format: <tt>fullform baseform postags</tt> (tab separated)
+ * 
+ * @author Ionuț Păduraru
+ * @see ManualTagger  
+ * @see BaseSynthesizer
+ */
+public class ManualSynthesizer {
+
+  /** A map whose key is composed of the lemma and POS tag (separated by "|"). The 
values are lists of inflected forms. */ 
+  private final Map<String, List<String>> mapping;
+
+  public ManualSynthesizer(final InputStream file) throws IOException {
+    mapping = loadMapping(file, "utf8");
+  }
+
+  /**
+   * Look up a word's inflected form as specified by the lemma and POS tag.
+   * 
+   * @param lemma the lemma to inflect.
+   * @param posTag the required POS tag.
+   * @return a list with all the inflected forms of the specified lemma having 
the specified POS tag. If no inflected form is found, the function returns 
<code>null</code>.
+   */
+  public List<String> lookup(final String lemma, final String posTag) {
+    return mapping.get(lemma + "|" + posTag);
+  }
+
+  private Map<String, List<String>> loadMapping(final InputStream file,
+      final String encoding) throws IOException {
+    // TODO consider refactoring: this is almost the same as 
BaseSynthesizer#loadMappings()
+    final Map<String, List<String>> map = new HashMap<String, List<String>>();
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, encoding);
+      br = new BufferedReader(isr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        if (StringTools.isEmpty(line) || line.charAt(0)=='#') {
+          continue;
+        }
+        final String[] parts = line.split("\t");
+        if (parts.length != 3) {
+          throw new IOException("Unknown format in " + file + ": " + line);
+        }
+        final String key = parts[1] + "|" + parts[2];
+        if (!map.containsKey(key)) {
+          map.put(key, new ArrayList<String>());
+        }
+        map.get(key).add(parts[0]);
+      }
+    } finally {
+      if (br != null) {
+        br.close();
+      }
+      if (isr != null) {
+        isr.close();
+      }
+    }
+    return map;
+  }
+
+}

Modified: 
trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java     
2012-01-06 11:08:28 UTC (rev 6169)
+++ trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java     
2012-01-06 12:19:18 UTC (rev 6170)
@@ -27,6 +27,7 @@
 import java.util.List;
 import java.util.Map;
 
+import org.languagetool.synthesis.ManualSynthesizer;
 import org.languagetool.tools.StringTools;
 
 
@@ -40,6 +41,8 @@
  * File Format: <tt>fullform baseform postags</tt> (tab separated)
  * 
  * @author Daniel Naber
+ * 
+ * @see ManualSynthesizer
  */
 public class ManualTagger {
 
@@ -76,6 +79,7 @@
 
   private Map<String, List<LookedUpTerm>> loadMapping(final InputStream file,
       final String encoding) throws IOException {
+    // TODO consider refactoring: this is almost the same as 
ManualSynthesizer#loadMapping()
     final Map<String, List<LookedUpTerm>> map = new HashMap<String, 
List<LookedUpTerm>>();
     InputStreamReader isr = null;
     BufferedReader br = null;

Added: 
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
===================================================================
--- 
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
                          (rev 0)
+++ 
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
  2012-01-06 12:19:18 UTC (rev 6170)
@@ -0,0 +1,80 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.synthesis;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * Test class for {@link ManualSynthesizer}.
+ * 
+ * @author Ionuț Păduraru
+ */
+public class ManualSynthesizerTest extends TestCase {
+
+  private ManualSynthesizer synthesizer;
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    final String data = 
+      "# some test data\n" +
+      "InflectedForm11\tLemma1\tPOS1\n" +
+      "InflectedForm121\tLemma1\tPOS2\n" +
+      "InflectedForm122\tLemma1\tPOS2\n" +
+      "InflectedForm2\tLemma2\tPOS1\n"
+      ;
+    synthesizer = new ManualSynthesizer(new 
ByteArrayInputStream(data.getBytes("UTF-8")));
+  }
+
+  /**
+   * Look up values that do not exist in the dictionary.
+   */
+  public void testLookupNonExisting() throws IOException {
+    assertNull(synthesizer.lookup("", ""));
+    assertNull(synthesizer.lookup("", null));
+    assertNull(synthesizer.lookup(null, ""));
+    assertNull(synthesizer.lookup(null, null));
+    assertNull(synthesizer.lookup("NONE", "UNKNOWN"));
+  }
+
+  /**
+   * Look up values that do not exist in the dictionary but do exist in a 
different form (e.g. with another POS tag).
+   */
+  public void testInvalidLookup() throws IOException {
+    assertNull(synthesizer.lookup("NONE", "POS1"));
+    assertNull(synthesizer.lookup("Lemma1", "UNKNOWN"));
+    assertNull(synthesizer.lookup("Lemma1", "POS.")); // no reg exp
+    assertNull(synthesizer.lookup("Lemma2", "POS2"));
+  }
+
+  public void testValidLookup() throws IOException {
+    assertEquals("[InflectedForm11]", 
String.valueOf(synthesizer.lookup("Lemma1", "POS1")));
+    assertEquals("[InflectedForm121, InflectedForm122]", 
String.valueOf(synthesizer.lookup("Lemma1", "POS2")));
+    assertEquals("[InflectedForm2]", 
String.valueOf(synthesizer.lookup("Lemma2", "POS1")));
+  }
+
+  public void testCaseSensitive() throws IOException {
+    // lookup is case sensitive:
+    assertNull(synthesizer.lookup("LEmma1", "POS1"));
+  }
+  
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Ridiculously easy VDI. With Citrix VDI-in-a-Box, you don't need a complex
infrastructure or vast IT resources to deliver seamless, secure access to
virtual desktops. With this all-in-one solution, easily deploy virtual 
desktops for less than the cost of PCs and save 60% on VDI infrastructure 
costs. Try it free! http://p.sf.net/sfu/Citrix-VDIinabox
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

Reply via email to