Revision: 6171 http://languagetool.svn.sourceforge.net/languagetool/?rev=6171&view=rev Author: archeus Date: 2012-01-06 12:24:24 +0000 (Fri, 06 Jan 2012) Log Message: ----------- [ro] Added support for a user dictionary so the *.dict files don't need to be recompiled for every dictionary change (for both the tagger and the synthesizer)
Modified Paths: -------------- trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java Added Paths: ----------- trunk/JLanguageTool/src/resource/ro/added.txt Modified: trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java 2012-01-06 12:19:18 UTC (rev 6170) +++ trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java 2012-01-06 12:24:24 UTC (rev 6171) @@ -18,8 +18,12 @@ */ package org.languagetool.synthesis.ro; +import java.io.IOException; +import java.util.List; + import org.languagetool.JLanguageTool; import org.languagetool.synthesis.BaseSynthesizer; +import org.languagetool.synthesis.ManualSynthesizer; /** * Romanian word form synthesizer. 
<br/> @@ -30,11 +34,31 @@ public class RomanianSynthesizer extends BaseSynthesizer { private static final String RESOURCE_FILENAME = "/ro/romanian_synth.dict"; - private static final String TAGS_FILE_NAME = "/ro/romanian_tags.txt"; + private static final String USER_DICT_FILENAME = "/ro/added.txt"; + + private static ManualSynthesizer manualSynthesizer; public RomanianSynthesizer() { super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); } + + @Override + protected void lookup(String lemma, String posTag, List<String> results) { + super.lookup(lemma, posTag, results); + // add words that are missing from the romanian_synth.dict file + final List<String> manualForms = manualSynthesizer.lookup(lemma, posTag); + if (manualForms != null) { + results.addAll(manualForms); + } + } + + @Override + protected void initSynthesizer() throws IOException { + super.initSynthesizer(); + if (manualSynthesizer == null) { + manualSynthesizer = new ManualSynthesizer(JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME)); + } + } } Modified: trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java 2012-01-06 12:19:18 UTC (rev 6170) +++ trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java 2012-01-06 12:24:24 UTC (rev 6171) @@ -33,6 +33,7 @@ import org.languagetool.AnalyzedTokenReadings; import org.languagetool.JLanguageTool; import org.languagetool.tagging.BaseTagger; +import org.languagetool.tagging.ManualTagger; /** * Romanian Part-of-speech tagger @@ -42,8 +43,10 @@ public class RomanianTagger extends BaseTagger { private String RESOURCE_FILENAME = "/ro/romanian.dict"; + private String USER_DICT_FILENAME = "/ro/added.txt"; private IStemmer morfologik; + private ManualTagger manualTagger; 
private static final Locale roLocale = new Locale("ro"); @Override @@ -56,17 +59,16 @@ setLocale(roLocale); } - public RomanianTagger(final String fileName) { + public RomanianTagger(final String dictFileName, final String userDictFileName) { super(); - RESOURCE_FILENAME = fileName; + RESOURCE_FILENAME = dictFileName; + USER_DICT_FILENAME = userDictFileName; setLocale(roLocale); } @Override public final List<AnalyzedTokenReadings> tag( final List<String> sentenceTokens) throws IOException { - List<WordData> taggerTokens; - final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); int pos = 0; // caching Lametyzator instance - lazy init @@ -74,10 +76,15 @@ final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); morfologik = new DictionaryLookup(Dictionary.read(url)); } + if (manualTagger == null && USER_DICT_FILENAME != null) { + manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME)); + } + for (final String word : sentenceTokens) { final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); - taggerTokens = morfologik.lookup(word.toLowerCase(roLocale)); + final String lowerCaseWord = word.toLowerCase(roLocale); + final List<WordData> taggerTokens = morfologik.lookup(lowerCaseWord); if (taggerTokens != null) { for (WordData wd : taggerTokens) { final String[] tagsArr = wd.getStem().toString().split("\\+"); @@ -87,8 +94,16 @@ } } } + if (manualTagger != null) { // add user tags, if any + final String[] manualTags = manualTagger.lookup(lowerCaseWord); + if (manualTags != null) { + for (int i = 0; i < manualTags.length/2; i=i+2) { + l.add(new AnalyzedToken(word, manualTags[i+1], manualTags[i])); + } + } + } - if (taggerTokens == null || taggerTokens.isEmpty()) { + if (l.isEmpty()) { l.add(new AnalyzedToken(word, null, null)); } tokenReadings.add(new AnalyzedTokenReadings(l Added: trunk/JLanguageTool/src/resource/ro/added.txt 
=================================================================== --- trunk/JLanguageTool/src/resource/ro/added.txt (rev 0) +++ trunk/JLanguageTool/src/resource/ro/added.txt 2012-01-06 12:24:24 UTC (rev 6171) @@ -0,0 +1,32 @@ +# [RO] +# Dicționar de etichete POS (part of speach / părți de vorbire) ce vine în completarea dicționarelor romanian.dict și romanian_synth.dict +# Codare fișier: UTF-8 +# Format: <formă flexionată> <formă bază> <etichetă POS> (separate de TAB) +# Pentru detalii priving etichetele vezi fișierul coduri.html + +# [EN] +# A POS tag dictionary that's used additionally to romanian.dict and romanian_synth.dict +# File Encoding: UTF-8 +# Format: fullform baseform postags (tab separated) +# For POS tag information see file coduri.html + + + +# Format +# <formă flexionată> <formă bază> <etichetă POS> + +# [a configura] +# conjunctiv +configurez configura V0s1000cz0 +configurezi configura V0s2000cz0 +configurăm configura V0p1000cz0 +configurați configura V0p2000cz0 +## configureze (sg+pl) already exists in .dict +# indicativ +configuram configura V0s1000ii0 +configuram configura V0p1000ii0 +## de adăugat formele pentru infinitiv și participiu + +# [a enumera] +# in .dict este greșit: enumăm în loc de enumăr +enumăr enumera V0s1000cz0 Modified: trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java =================================================================== --- trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java 2012-01-06 12:19:18 UTC (rev 6170) +++ trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java 2012-01-06 12:24:24 UTC (rev 6171) @@ -77,6 +77,15 @@ dummyToken("legătură"), "Sfs3aac000"))); assertEquals("[legătură]", Arrays.toString(synth.synthesize( dummyToken("legătură"), "Sfs3anc000"))); + + // user data (/ro/added.txt) + assertEquals("[configurați]", Arrays.toString(synth.synthesize( + dummyToken("configura"), "V0p2000cz0"))); // 
no reg exp + assertEquals("[configurați, configurezi]", Arrays.toString(synth.synthesize( + dummyToken("configura"), "V0.2000cz0", true))); // using reg exp + // assertEquals("[enumăr]", Arrays.toString(synth.synthesize( + // dummyToken("enumera"), "V0s1000cz0"))); + // commented out as "a enumera" contains an extra form (.dict spelling error - "enumăm" instead of "enumăr"). To be fixed. } Modified: trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java =================================================================== --- trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java 2012-01-06 12:19:18 UTC (rev 6170) +++ trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java 2012-01-06 12:24:24 UTC (rev 6171) @@ -57,7 +57,7 @@ @Override protected RomanianTagger createTagger() { RomanianTagger res = new RomanianTagger( - "/ro/test_diacritics.dict"); + "/ro/test_diacritics.dict", null); return res; } Modified: trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java =================================================================== --- trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java 2012-01-06 12:19:18 UTC (rev 6170) +++ trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java 2012-01-06 12:24:24 UTC (rev 6171) @@ -74,6 +74,15 @@ } /** + * Test for entries in used dictionary. + * @throws Exception + */ + public void testTagger_UserDict() throws Exception { + assertHasLemmaAndPos("configurați", "configura", "V0p2000cz0"); // de adăugat formele pentru infinitiv și participiu + // to be updated when the words from added.txt are moved to romanian.dict + } + + /** * the big picture: test is tagger performs well with a sentence * * @author ionuț păduraru This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
------------------------------------------------------------------------------ Ridiculously easy VDI. With Citrix VDI-in-a-Box, you don't need a complex infrastructure or vast IT resources to deliver seamless, secure access to virtual desktops. With this all-in-one solution, easily deploy virtual desktops for less than the cost of PCs and save 60% on VDI infrastructure costs. Try it free! http://p.sf.net/sfu/Citrix-VDIinabox _______________________________________________ Languagetool-cvs mailing list Languagetool-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-cvs