AbstractSimpleReplaceRule: look through multiple lemmas

Andriy Rysin Mon, 12 Sep 2016 15:52:36 -0700

I just got a report that, for Ukrainian, the simple replace rule does not
pick the right lemma for the replacements. E.g. if the token is not found
in the replace list, its lemmas are searched and the first one found is used
to look up the replacement list. If the token has multiple lemmas, this may
not be the right one to pick.
I've adjusted the code to iterate over all lemmas and combine their
replacements into a single list without duplicates.


All language tests pass, but since it's a core rule used by many, I'd like
to have it reviewed before I push.

The patch is attached.

Thanks
Andriy

diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractSimpleReplaceRule.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractSimpleReplaceRule.java
index 7ea2b3a..3aa979c 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractSimpleReplaceRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractSimpleReplaceRule.java
@@ -18,15 +18,20 @@
  */
 package org.languagetool.rules;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.ResourceBundle;
+import java.util.stream.Collectors;
+
 import org.apache.commons.lang.StringUtils;
 import org.languagetool.AnalyzedSentence;
 import org.languagetool.AnalyzedToken;
 import org.languagetool.AnalyzedTokenReadings;
 import org.languagetool.tools.StringTools;
 
-import java.io.IOException;
-import java.util.*;
-
 /**
  * A rule that matches words which should not be used and suggests
  * correct ones instead. Loads the relevant words from
@@ -106,6 +111,10 @@ public abstract class AbstractSimpleReplaceRule extends Rule {
 
     for (AnalyzedTokenReadings tokenReadings : tokens) {
 
+      // short for SENT_START
+      if( StringUtils.isEmpty(tokenReadings.getToken()) )
+        continue;
+
       //this rule is used mostly for spelling, so ignore both immunized
       // and speller-ignorable rules
       if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
@@ -118,23 +127,29 @@ public abstract class AbstractSimpleReplaceRule extends Rule {
       }
       String tokenString = cleanup(originalTokenStr);
 
-      if (!getWrongWords().containsKey(tokenString) && checkLemmas) {
+      // try first with the original word, then with the all lower-case version
+      List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
+      if (possibleReplacements == null) {
+        possibleReplacements = getWrongWords().get(tokenString);
+      }
+
+      if (possibleReplacements == null && checkLemmas) {
+        possibleReplacements = new ArrayList<>();
+
+        ArrayList<String> lemmas = new ArrayList<>();
         for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
           String lemma = analyzedToken.getLemma();
-          if (lemma != null) {
-            lemma = cleanup(lemma);
-            if (getWrongWords().containsKey(lemma)) {
-              tokenString = lemma;
-              break;
-            }
+          if (lemma != null && getWrongWords().containsKey(lemma) && ! lemmas.contains(lemma) ) {
+            lemmas.add(cleanup(lemma));
           }
         }
-      }
 
-      // try first with the original word, then with the all lower-case version
-      List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
-      if (possibleReplacements == null) {
-        possibleReplacements = getWrongWords().get(tokenString);
+        for (String lemma: lemmas) {
+          List<String> replacements = getWrongWords().get(lemma);
+          possibleReplacements.addAll(replacements);
+        }
+
+        possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
       }
 
       if (possibleReplacements != null && possibleReplacements.size() > 0) {
diff --git a/languagetool-language-modules/uk/src/test/java/org/languagetool/rules/uk/SimpleReplaceRuleTest.java b/languagetool-language-modules/uk/src/test/java/org/languagetool/rules/uk/SimpleReplaceRuleTest.java
index 1bb8a9f..3b444af 100644
--- a/languagetool-language-modules/uk/src/test/java/org/languagetool/rules/uk/SimpleReplaceRuleTest.java
+++ b/languagetool-language-modules/uk/src/test/java/org/languagetool/rules/uk/SimpleReplaceRuleTest.java
@@ -70,5 +70,10 @@ public class SimpleReplaceRuleTest {
 
     matches = rule.match(langTool.getAnalyzedSentence("щедроти"));
     assertEquals(0, matches.length);
+
+    matches = rule.match(langTool.getAnalyzedSentence("Задля благоустрою."));
+    assertEquals(1, matches.length);
+    assertEquals(Arrays.asList("упорядковувати", "упорядкувати", "упоряджати", "упорядити", "доброустрій", "порядок", "добрий лад"), matches[0].getSuggestedReplacements());
+
   }
 }

------------------------------------------------------------------------------
What NetFlow Analyzer can do for you? Monitors network bandwidth and traffic
patterns at an interface-level. Reveals which users, apps, and protocols are 
consuming the most bandwidth. Provides multi-vendor support for NetFlow, 
J-Flow, sFlow and other flows. Make informed decisions using capacity 
planning reports. http://sdm.link/zohodev2dev

_______________________________________________
Languagetool-devel mailing list
Languagetool-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-devel

AbstractSimpleReplaceRule: look through multiple lemmas

Reply via email to