Revision: 6171
http://languagetool.svn.sourceforge.net/languagetool/?rev=6171&view=rev
Author: archeus
Date: 2012-01-06 12:24:24 +0000 (Fri, 06 Jan 2012)
Log Message:
-----------
[ro] Added support for a user dictionary so the *.dict files don't need to be
recompiled for every dictionary change (both for the tagger and the synthesizer)
Modified Paths:
--------------
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java
trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java
Added Paths:
-----------
trunk/JLanguageTool/src/resource/ro/added.txt
Modified:
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java
2012-01-06 12:19:18 UTC (rev 6170)
+++
trunk/JLanguageTool/src/java/org/languagetool/synthesis/ro/RomanianSynthesizer.java
2012-01-06 12:24:24 UTC (rev 6171)
@@ -18,8 +18,12 @@
*/
package org.languagetool.synthesis.ro;
+import java.io.IOException;
+import java.util.List;
+
import org.languagetool.JLanguageTool;
import org.languagetool.synthesis.BaseSynthesizer;
+import org.languagetool.synthesis.ManualSynthesizer;
/**
* Romanian word form synthesizer. <br/>
@@ -30,11 +34,31 @@
public class RomanianSynthesizer extends BaseSynthesizer {
private static final String RESOURCE_FILENAME =
"/ro/romanian_synth.dict";
-
private static final String TAGS_FILE_NAME = "/ro/romanian_tags.txt";
+ private static final String USER_DICT_FILENAME = "/ro/added.txt";
+
+ private static ManualSynthesizer manualSynthesizer;
/**
 * Creates the synthesizer, handing the parent class the paths of the
 * synthesizer dictionary (RESOURCE_FILENAME) and of the tags file
 * (TAGS_FILE_NAME), each prefixed with the data broker's resource directory.
 */
public RomanianSynthesizer() {
super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME,
JLanguageTool.getDataBroker().getResourceDir() +
TAGS_FILE_NAME);
}
+
+ /**
+  * Collects the word forms for the given lemma and POS tag: first whatever the
+  * parent lookup yields, then any forms returned by the manual synthesizer.
+  *
+  * @param lemma the base form to inflect
+  * @param posTag the POS tag of the wanted form
+  * @param results list the found forms are appended to
+  */
+ @Override
+ protected void lookup(String lemma, String posTag, List<String> results) {
+   super.lookup(lemma, posTag, results);
+   // the manual synthesizer supplies words missing from romanian_synth.dict
+   final List<String> userForms = manualSynthesizer.lookup(lemma, posTag);
+   if (userForms == null) {
+     return;
+   }
+   results.addAll(userForms);
+ }
+
+ /**
+  * Runs the parent initialization and then, if it has not been created yet,
+  * builds the shared manual synthesizer from the user dictionary resource
+  * (USER_DICT_FILENAME).
+  *
+  * @throws IOException declared by the overridden method
+  */
+ @Override
+ protected void initSynthesizer() throws IOException {
+   super.initSynthesizer();
+   if (manualSynthesizer != null) {
+     return; // already created, nothing more to do
+   }
+   manualSynthesizer = new ManualSynthesizer(
+       JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME));
+ }
}
Modified:
trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java
2012-01-06 12:19:18 UTC (rev 6170)
+++
trunk/JLanguageTool/src/java/org/languagetool/tagging/ro/RomanianTagger.java
2012-01-06 12:24:24 UTC (rev 6171)
@@ -33,6 +33,7 @@
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tagging.BaseTagger;
+import org.languagetool.tagging.ManualTagger;
/**
* Romanian Part-of-speech tagger
@@ -42,8 +43,10 @@
public class RomanianTagger extends BaseTagger {
private String RESOURCE_FILENAME = "/ro/romanian.dict";
+ private String USER_DICT_FILENAME = "/ro/added.txt";
private IStemmer morfologik;
+ private ManualTagger manualTagger;
private static final Locale roLocale = new Locale("ro");
@Override
@@ -56,17 +59,16 @@
setLocale(roLocale);
}
- public RomanianTagger(final String fileName) {
+ public RomanianTagger(final String dictFileName, final String
userDictFileName) {
super();
- RESOURCE_FILENAME = fileName;
+ RESOURCE_FILENAME = dictFileName;
+ USER_DICT_FILENAME = userDictFileName;
setLocale(roLocale);
}
@Override
public final List<AnalyzedTokenReadings> tag(
final List<String> sentenceTokens) throws IOException {
- List<WordData> taggerTokens;
-
final List<AnalyzedTokenReadings> tokenReadings = new
ArrayList<AnalyzedTokenReadings>();
int pos = 0;
// caching Lametyzator instance - lazy init
@@ -74,10 +76,15 @@
final URL url =
JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME);
morfologik = new DictionaryLookup(Dictionary.read(url));
}
+ if (manualTagger == null && USER_DICT_FILENAME != null) {
+ manualTagger = new
ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME));
+ }
+
for (final String word : sentenceTokens) {
final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>();
- taggerTokens = morfologik.lookup(word.toLowerCase(roLocale));
+ final String lowerCaseWord = word.toLowerCase(roLocale);
+ final List<WordData> taggerTokens = morfologik.lookup(lowerCaseWord);
if (taggerTokens != null) {
for (WordData wd : taggerTokens) {
final String[] tagsArr = wd.getStem().toString().split("\\+");
@@ -87,8 +94,16 @@
}
}
}
+ if (manualTagger != null) { // add user tags, if any
+   final String[] manualTags = manualTagger.lookup(lowerCaseWord);
+   if (manualTags != null) {
+     // Entries are consumed two at a time (manualTags[i] and manualTags[i+1]),
+     // presumably as (lemma, POS tag) pairs - confirm against ManualTagger.
+     // The bound must therefore be the full array length: with a step of two,
+     // a bound of length/2 visits only the first half of the pairs and drops
+     // the remaining readings of words that have several user entries.
+     for (int i = 0; i + 1 < manualTags.length; i += 2) {
+       l.add(new AnalyzedToken(word, manualTags[i + 1], manualTags[i]));
+     }
+   }
+ }
- if (taggerTokens == null || taggerTokens.isEmpty()) {
+ if (l.isEmpty()) {
l.add(new AnalyzedToken(word, null, null));
}
tokenReadings.add(new AnalyzedTokenReadings(l
Added: trunk/JLanguageTool/src/resource/ro/added.txt
===================================================================
--- trunk/JLanguageTool/src/resource/ro/added.txt
(rev 0)
+++ trunk/JLanguageTool/src/resource/ro/added.txt 2012-01-06 12:24:24 UTC
(rev 6171)
@@ -0,0 +1,32 @@
+# [RO]
+# Dicționar de etichete POS (part of speech / părți de vorbire) ce vine în
completarea dicționarelor romanian.dict și romanian_synth.dict
+# Codare fișier: UTF-8
+# Format: <formă flexionată> <formă bază> <etichetă POS> (separate de TAB)
+# Pentru detalii privind etichetele vezi fișierul coduri.html
+
+# [EN]
+# A POS tag dictionary that's used additionally to romanian.dict and
romanian_synth.dict
+# File Encoding: UTF-8
+# Format: fullform baseform postags (tab separated)
+# For POS tag information see file coduri.html
+
+
+
+# Format
+# <formă flexionată> <formă bază> <etichetă POS>
+
+# [a configura]
+# conjunctiv
+configurez configura V0s1000cz0
+configurezi configura V0s2000cz0
+configurăm configura V0p1000cz0
+configurați configura V0p2000cz0
+## configureze (sg+pl) already exists in .dict
+# indicativ
+configuram configura V0s1000ii0
+configuram configura V0p1000ii0
+## de adăugat formele pentru infinitiv și participiu
+
+# [a enumera]
+# in .dict este greșit: enumăm în loc de enumăr
+enumăr enumera V0s1000cz0
Modified:
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java
===================================================================
---
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java
2012-01-06 12:19:18 UTC (rev 6170)
+++
trunk/JLanguageTool/src/test/org/languagetool/synthesis/ro/RomanianSynthesizerTest.java
2012-01-06 12:24:24 UTC (rev 6171)
@@ -77,6 +77,15 @@
dummyToken("legătură"), "Sfs3aac000")));
assertEquals("[legătură]", Arrays.toString(synth.synthesize(
dummyToken("legătură"), "Sfs3anc000")));
+
+ // user data (/ro/added.txt)
+ assertEquals("[configurați]", Arrays.toString(synth.synthesize(
+ dummyToken("configura"), "V0p2000cz0"))); // no
reg exp
+ assertEquals("[configurați, configurezi]",
Arrays.toString(synth.synthesize(
+ dummyToken("configura"), "V0.2000cz0", true)));
// using reg exp
+ // assertEquals("[enumăr]",
Arrays.toString(synth.synthesize(
+ // dummyToken("enumera"), "V0s1000cz0")));
+ // commented out as "a enumera" contains an extra form (.dict
spelling error - "enumăm" instead of "enumăr"). To be fixed.
}
Modified:
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java
===================================================================
---
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java
2012-01-06 12:19:18 UTC (rev 6170)
+++
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerDiacriticsTest.java
2012-01-06 12:24:24 UTC (rev 6171)
@@ -57,7 +57,7 @@
@Override
protected RomanianTagger createTagger() {
RomanianTagger res = new RomanianTagger(
- "/ro/test_diacritics.dict");
+ "/ro/test_diacritics.dict", null);
return res;
}
Modified:
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java
===================================================================
---
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java
2012-01-06 12:19:18 UTC (rev 6170)
+++
trunk/JLanguageTool/src/test/org/languagetool/tagging/ro/RomanianTaggerTest.java
2012-01-06 12:24:24 UTC (rev 6171)
@@ -74,6 +74,15 @@
}
/**
+ * Test for entries in the user dictionary.
+ * @throws Exception
+ */
+ public void testTagger_UserDict() throws Exception {
+ assertHasLemmaAndPos("configurați", "configura", "V0p2000cz0");
// de adăugat formele pentru infinitiv și participiu
+ // to be updated when the words from added.txt are moved to
romanian.dict
+ }
+
+ /**
* the big picture: test if the tagger performs well with a sentence
*
* @author ionuț păduraru
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Ridiculously easy VDI. With Citrix VDI-in-a-Box, you don't need a complex
infrastructure or vast IT resources to deliver seamless, secure access to
virtual desktops. With this all-in-one solution, easily deploy virtual
desktops for less than the cost of PCs and save 60% on VDI infrastructure
costs. Try it free! http://p.sf.net/sfu/Citrix-VDIinabox
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs