Revision: 6170 http://languagetool.svn.sourceforge.net/languagetool/?rev=6170&view=rev Author: archeus Date: 2012-01-06 12:19:18 +0000 (Fri, 06 Jan 2012) Log Message: ----------- Added a "synthesizer" that takes its information from a plain text file so the *.dict doesn't need to be recompiled for every dictionary change (like ManualTagger)
Modified Paths: -------------- trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java Added Paths: ----------- trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java Added: trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java (rev 0) +++ trunk/JLanguageTool/src/java/org/languagetool/synthesis/ManualSynthesizer.java 2012-01-06 12:19:18 UTC (rev 6170) @@ -0,0 +1,99 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.synthesis; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.languagetool.tagging.ManualTagger; +import org.languagetool.tools.StringTools; + +/** + * A synthesizer that reads the inflected form and POS information from a plain (UTF-8) text file. <br/> + * This makes it possible for the user to edit the text file to let the system know + * about new words or missing readings in the synthesizer *.dict file. + * <p> + * File Format: <tt>fullform baseform postags</tt> (tab separated) + * + * @author Ionuț Păduraru + * @see ManualTagger + * @see BaseSynthesizer + */ +public class ManualSynthesizer { + + /** a map with the key composed by the lemma and POS (separated by "|"). The values are lists of inflected forms. */ + private final Map<String, List<String>> mapping; + + public ManualSynthesizer(final InputStream file) throws IOException { + mapping = loadMapping(file, "utf8"); + } + + /** + * Look up a word's inflected form as specified by the lemma and POS tag. + * + * @param lemma the lemma to inflect. + * @param posTag the required POS tag. + * @return a list with all the inflected forms of the specified lemma having the specified POS tag. If no inflected form is found, the function returns <code>null</code>. 
+ */ + public List<String> lookup(final String lemma, final String posTag) { + return mapping.get(lemma + "|" + posTag); + } + + private Map<String, List<String>> loadMapping(final InputStream file, + final String encoding) throws IOException { + // TODO consider refactoring: this is almost the same as BaseSynthesizer#loadMappings() + final Map<String, List<String>> map = new HashMap<String, List<String>>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, encoding); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + if (StringTools.isEmpty(line) || line.charAt(0)=='#') { + continue; + } + final String[] parts = line.split("\t"); + if (parts.length != 3) { + throw new IOException("Unknown format in " + file + ": " + line); + } + final String key = parts[1] + "|" + parts[2]; + if (!map.containsKey(key)) { + map.put(key, new ArrayList<String>()); + } + map.get(key).add(parts[0]); + } + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + return map; + } + +} Modified: trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java 2012-01-06 11:08:28 UTC (rev 6169) +++ trunk/JLanguageTool/src/java/org/languagetool/tagging/ManualTagger.java 2012-01-06 12:19:18 UTC (rev 6170) @@ -27,6 +27,7 @@ import java.util.List; import java.util.Map; +import org.languagetool.synthesis.ManualSynthesizer; import org.languagetool.tools.StringTools; @@ -40,6 +41,8 @@ * File Format: <tt>fullform baseform postags</tt> (tab separated) * * @author Daniel Naber + * + * @see ManualSynthesizer */ public class ManualTagger { @@ -76,6 +79,7 @@ private Map<String, List<LookedUpTerm>> loadMapping(final InputStream file, final String encoding) throws IOException { + // TODO consider refactoring: this is almost 
the same as ManualSynthesizer#loadMapping() final Map<String, List<LookedUpTerm>> map = new HashMap<String, List<LookedUpTerm>>(); InputStreamReader isr = null; BufferedReader br = null; Added: trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java =================================================================== --- trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java (rev 0) +++ trunk/JLanguageTool/src/test/org/languagetool/synthesis/ManualSynthesizerTest.java 2012-01-06 12:19:18 UTC (rev 6170) @@ -0,0 +1,80 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.synthesis; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import junit.framework.TestCase; + +/** + * Test class for {@link ManualSynthesizer}. 
+ * + * @author Ionuț Păduraru + */ +public class ManualSynthesizerTest extends TestCase { + + private ManualSynthesizer synthesizer; + + @Override + protected void setUp() throws Exception { + super.setUp(); + final String data = + "# some test data\n" + + "InflectedForm11\tLemma1\tPOS1\n" + + "InflectedForm121\tLemma1\tPOS2\n" + + "InflectedForm122\tLemma1\tPOS2\n" + + "InflectedForm2\tLemma2\tPOS1\n" + ; + synthesizer = new ManualSynthesizer(new ByteArrayInputStream(data.getBytes("UTF-8"))); + } + + /** + * Lookup values that do not exist in the dictionary. + */ + public void testLookupNonExisting() throws IOException { + assertNull(synthesizer.lookup("", "")); + assertNull(synthesizer.lookup("", null)); + assertNull(synthesizer.lookup(null, "")); + assertNull(synthesizer.lookup(null, null)); + assertNull(synthesizer.lookup("NONE", "UNKNOWN")); + } + + /** + * Lookup values that do not exist in the dictionary but they do exist in different form (like other POS). + */ + public void testInvalidLookup() throws IOException { + assertNull(synthesizer.lookup("NONE", "POS1")); + assertNull(synthesizer.lookup("Lemma1", "UNKNOWN")); + assertNull(synthesizer.lookup("Lemma1", "POS.")); // no reg exp + assertNull(synthesizer.lookup("Lemma2", "POS2")); + } + + public void testValidLookup() throws IOException { + assertEquals("[InflectedForm11]", String.valueOf(synthesizer.lookup("Lemma1", "POS1"))); + assertEquals("[InflectedForm121, InflectedForm122]", String.valueOf(synthesizer.lookup("Lemma1", "POS2"))); + assertEquals("[InflectedForm2]", String.valueOf(synthesizer.lookup("Lemma2", "POS1"))); + } + + public void testCaseSensitive() throws IOException { + // lookup is case sensitive: + assertNull(synthesizer.lookup("LEmma1", "POS1")); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Ridiculously easy VDI. 
With Citrix VDI-in-a-Box, you don't need a complex infrastructure or vast IT resources to deliver seamless, secure access to virtual desktops. With this all-in-one solution, easily deploy virtual desktops for less than the cost of PCs and save 60% on VDI infrastructure costs. Try it free! http://p.sf.net/sfu/Citrix-VDIinabox _______________________________________________ Languagetool-cvs mailing list Languagetool-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-cvs