Revision: 6170
http://languagetool.svn.sourceforge.net/languagetool/?rev=6170&view=rev
Author: archeus
Date: 2012-01-06 12:19:18 +0000 (Fri, 06 Jan 2012)
Log Message:
-----------
Added a "synthesizer" that takes its information from a plain text file so the
*.dict doesn't need to be recompiled for every dictionary change (like
ManualTagger)
Modified Paths:
--------------
trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java
Added Paths:
-----------
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
Added:
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java
(rev 0)
+++
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java
2012-01-06 12:19:18 UTC (rev 6170)
@@ -0,0 +1,99 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.synthesis;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.languagetool.tagging.ManualTagger;
+import org.languagetool.tools.StringTools;
+
+/**
+ * A synthesizer that reads the inflected form and POS information from a
plain (UTF-8) text file. <br/>
+ * This makes it possible for the user to edit the text file to let the system
know
+ * about new words or missing readings in the synthesizer *.dict file.
+ * <p>
+ * File Format: <tt>fullform baseform postags</tt> (tab separated)
+ *
+ * @author Ionuț Păduraru
+ * @see ManualTagger
+ * @see BaseSynthesizer
+ */
+public class ManualSynthesizer {
+
+ /** A map whose keys are composed of the lemma and POS tag (separated by "|"). The
values are lists of inflected forms. */
+ private final Map<String, List<String>> mapping;
+
+ public ManualSynthesizer(final InputStream file) throws IOException {
+ mapping = loadMapping(file, "utf8");
+ }
+
+ /**
+ * Look up a word's inflected form as specified by the lemma and POS tag.
+ *
+ * @param lemma the lemma to inflect.
+ * @param posTag the required POS tag.
+ * @return a list with all the inflected forms of the specified lemma having
the specified POS tag. If no inflected form is found, the function returns
<code>null</code>.
+ */
+ public List<String> lookup(final String lemma, final String posTag) {
+ return mapping.get(lemma + "|" + posTag);
+ }
+
+ private Map<String, List<String>> loadMapping(final InputStream file,
+ final String encoding) throws IOException {
+ // TODO consider refactoring: this is almost the same as
BaseSynthesizer#loadMappings()
+ final Map<String, List<String>> map = new HashMap<String, List<String>>();
+ InputStreamReader isr = null;
+ BufferedReader br = null;
+ try {
+ isr = new InputStreamReader(file, encoding);
+ br = new BufferedReader(isr);
+ String line;
+ while ((line = br.readLine()) != null) {
+ if (StringTools.isEmpty(line) || line.charAt(0)=='#') {
+ continue;
+ }
+ final String[] parts = line.split("\t");
+ if (parts.length != 3) {
+ throw new IOException("Unknown format in " + file + ": " + line);
+ }
+ final String key = parts[1] + "|" + parts[2];
+ if (!map.containsKey(key)) {
+ map.put(key, new ArrayList<String>());
+ }
+ map.get(key).add(parts[0]);
+ }
+ } finally {
+ if (br != null) {
+ br.close();
+ }
+ if (isr != null) {
+ isr.close();
+ }
+ }
+ return map;
+ }
+
+}
Modified:
trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java
2012-01-06 11:08:28 UTC (rev 6169)
+++ trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java
2012-01-06 12:19:18 UTC (rev 6170)
@@ -27,6 +27,7 @@
import java.util.List;
import java.util.Map;
+import org.languagetool.synthesis.ManualSynthesizer;
import org.languagetool.tools.StringTools;
@@ -40,6 +41,8 @@
* File Format: <tt>fullform baseform postags</tt> (tab separated)
*
* @author Daniel Naber
+ *
+ * @see ManualSynthesizer
*/
public class ManualTagger {
@@ -76,6 +79,7 @@
private Map<String, List<LookedUpTerm>> loadMapping(final InputStream file,
final String encoding) throws IOException {
+ // TODO consider refactoring: this is almost the same as
ManualSynthesizer#loadMapping()
final Map<String, List<LookedUpTerm>> map = new HashMap<String,
List<LookedUpTerm>>();
InputStreamReader isr = null;
BufferedReader br = null;
Added:
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
===================================================================
---
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
(rev 0)
+++
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java
2012-01-06 12:19:18 UTC (rev 6170)
@@ -0,0 +1,80 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.synthesis;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * Test class for {@link ManualSynthesizer}.
+ *
+ * @author Ionuț Păduraru
+ */
+public class ManualSynthesizerTest extends TestCase {
+
+ private ManualSynthesizer synthesizer;
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ final String data =
+ "# some test data\n" +
+ "InflectedForm11\tLemma1\tPOS1\n" +
+ "InflectedForm121\tLemma1\tPOS2\n" +
+ "InflectedForm122\tLemma1\tPOS2\n" +
+ "InflectedForm2\tLemma2\tPOS1\n"
+ ;
+ synthesizer = new ManualSynthesizer(new
ByteArrayInputStream(data.getBytes("UTF-8")));
+ }
+
+ /**
+ * Look up values that do not exist in the dictionary.
+ */
+ public void testLookupNonExisting() throws IOException {
+ assertNull(synthesizer.lookup("", ""));
+ assertNull(synthesizer.lookup("", null));
+ assertNull(synthesizer.lookup(null, ""));
+ assertNull(synthesizer.lookup(null, null));
+ assertNull(synthesizer.lookup("NONE", "UNKNOWN"));
+ }
+
+ /**
+ * Look up values that do not exist in the dictionary but do exist in a
different form (e.g. with another POS tag).
+ */
+ public void testInvalidLookup() throws IOException {
+ assertNull(synthesizer.lookup("NONE", "POS1"));
+ assertNull(synthesizer.lookup("Lemma1", "UNKNOWN"));
+ assertNull(synthesizer.lookup("Lemma1", "POS.")); // no reg exp
+ assertNull(synthesizer.lookup("Lemma2", "POS2"));
+ }
+
+ public void testValidLookup() throws IOException {
+ assertEquals("[InflectedForm11]",
String.valueOf(synthesizer.lookup("Lemma1", "POS1")));
+ assertEquals("[InflectedForm121, InflectedForm122]",
String.valueOf(synthesizer.lookup("Lemma1", "POS2")));
+ assertEquals("[InflectedForm2]",
String.valueOf(synthesizer.lookup("Lemma2", "POS1")));
+ }
+
+ public void testCaseSensitive() throws IOException {
+ // lookup is case sensitive:
+ assertNull(synthesizer.lookup("LEmma1", "POS1"));
+ }
+
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Ridiculously easy VDI. With Citrix VDI-in-a-Box, you don't need a complex
infrastructure or vast IT resources to deliver seamless, secure access to
virtual desktops. With this all-in-one solution, easily deploy virtual
desktops for less than the cost of PCs and save 60% on VDI infrastructure
costs. Try it free! http://p.sf.net/sfu/Citrix-VDIinabox
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs