[LanguageTool] SF.net SVN: languagetool:[7798] trunk/JLanguageTool/src

dnaber Sun, 05 Aug 2012 12:12:42 -0700

Revision: 7798
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=7798&view=rev
Author:   dnaber
Date:     2012-08-05 19:12:30 +0000 (Sun, 05 Aug 2012)
Log Message:
-----------
When rules are added programmatically, extract the tokens of their suggestions
so that the spell checker ignores those tokens; also extend the extractor with
a main method that extracts the ignore tokens for all languages.


Modified Paths:
--------------
    trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java
    
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java
    
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java
    
trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java

Modified: trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java    
2012-08-05 18:05:43 UTC (rev 7797)
+++ trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java    
2012-08-05 19:12:30 UTC (rev 7798)
@@ -50,6 +50,8 @@
 import org.languagetool.rules.patterns.FalseFriendRuleLoader;
 import org.languagetool.rules.patterns.PatternRule;
 import org.languagetool.rules.patterns.PatternRuleLoader;
+import org.languagetool.rules.spelling.SpellingCheckRule;
+import org.languagetool.rules.spelling.SuggestionExtractor;
 import org.languagetool.tagging.Tagger;
 import org.languagetool.tagging.disambiguation.Disambiguator;
 import org.languagetool.tokenizers.Tokenizer;
@@ -388,28 +390,65 @@
    */
   public void addRule(final Rule rule) {
     userRules.add(rule);
+    // TODO: not on first start - too slow:
+    final SuggestionExtractor extractor = new SuggestionExtractor(language);
+    final List<String> suggestionTokens = extractor.getSuggestionTokens(rule);
+    final List<Rule> allActiveRules = getAllActiveRules();
+    addIgnoreWords(suggestionTokens, allActiveRules);
   }
 
+  private void addIgnoreWords(List<String> suggestionTokens, List<Rule> 
allActiveRules) {
+    for (Rule activeRule : allActiveRules) {
+      if (activeRule instanceof SpellingCheckRule) {
+        ((SpellingCheckRule)activeRule).addIgnoreTokens(suggestionTokens);
+      }
+    }
+  }
+
+  private void setIgnoreWords(List<String> suggestionTokens, List<Rule> 
allActiveRules) {
+    for (Rule activeRule : allActiveRules) {
+      if (activeRule instanceof SpellingCheckRule) {
+        ((SpellingCheckRule)activeRule).resetIgnoreTokens();
+        ((SpellingCheckRule)activeRule).addIgnoreTokens(suggestionTokens);
+      }
+    }
+  }
+
   /**
    * Disable a given rule so {@link #check(String)} won't use it.
    * 
-   * @param ruleId
-   *          the id of the rule to disable
+   * @param ruleId the id of the rule to disable - no error will be given if 
the id does not exist
    */
   public void disableRule(final String ruleId) {
-    // TODO: check if such a rule exists
     disabledRules.add(ruleId);
+    reInitSpellCheckIgnoreWords();
   }
 
+  private void reInitSpellCheckIgnoreWords() {
+    final List<Rule> allActiveRules = getAllActiveRules();
+    final List<String> ignoreTokens = getAllIgnoreWords(allActiveRules);
+    setIgnoreWords(ignoreTokens, allActiveRules);
+  }
+
+  private List<String> getAllIgnoreWords(List<Rule> allActiveRules) {
+    final List<String> suggestionTokens = new ArrayList<String>();
+    for (Rule activeRule : allActiveRules) {
+      if (activeRule instanceof PatternRule) {
+        final SuggestionExtractor extractor = new 
SuggestionExtractor(language);
+        suggestionTokens.addAll(extractor.getSuggestionTokens(activeRule));
+      }
+    }
+    return suggestionTokens;
+  }
+
   /**
    * Disable a given category so {@link #check(String)} won't use it.
    * 
-   * @param categoryName
-   *          the id of the category to disable
+   * @param categoryName the id of the category to disable - no error will be 
given if the id does not exist
    */
   public void disableCategory(final String categoryName) {
-    // TODO: check if such a rule exists
     disabledCategories.add(categoryName);
+    reInitSpellCheckIgnoreWords();
   }
 
   /**

Modified: 
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java
 2012-08-05 18:05:43 UTC (rev 7797)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java
 2012-08-05 19:12:30 UTC (rev 7798)
@@ -63,6 +63,25 @@
   public void reset() {
   }
 
+  /**
+   * Add the given words to the list of words to be ignored during spell check.
+   */
+  public void addIgnoreTokens(List<String> tokens) {
+    wordsToBeIgnored.addAll(tokens);
+  }
+
+  /**
+   * Reset the list of words to be ignored, by re-loading it from the 
"ignore.txt" file.
+   */
+  public void resetIgnoreTokens() {
+    wordsToBeIgnored.clear();
+    try {
+      init();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
   protected boolean ignoreWord(String word) throws IOException {
     // TODO?: this is needed at least for German as Hunspell tokenization 
includes the dot:
     final String cleanWord = word.endsWith(".") ? word.substring(0, 
word.length() - 1) : word;
@@ -71,7 +90,6 @@
 
   protected void init() throws IOException {
     loadFileIfExists(language.getShortName() + SPELLING_IGNORE_FILE);
-    loadFileIfExists(language.getShortNameWithVariant() + 
SPELLING_IGNORE_FILE);
   }
 
   private void loadFileIfExists(String filename) throws IOException {

Modified: 
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java
       2012-08-05 18:05:43 UTC (rev 7797)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java
       2012-08-05 19:12:30 UTC (rev 7798)
@@ -18,19 +18,22 @@
  */
 package org.languagetool.rules.spelling;
 
+import org.languagetool.JLanguageTool;
 import org.languagetool.Language;
 import org.languagetool.rules.Rule;
 import org.languagetool.rules.patterns.PatternRule;
 
-import java.util.ArrayList;
-import java.util.List;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
  * Extract tokens from suggestions.
  */
-class SuggestionExtractor {
+public class SuggestionExtractor {
 
   private final static Pattern SUGGESTION_PATTERN = 
Pattern.compile("<suggestion.*?>(.*?)</suggestion>");
   private final static Pattern BACK_REFERENCE_PATTERN = Pattern.compile("\\\\" 
+ "\\d+");
@@ -43,7 +46,7 @@
   /**
    * Get the tokens of simple suggestions, i.e. those that don't use back 
references.
    */
-  List<String> getSuggestionTokens(Rule rule) {
+  public List<String> getSuggestionTokens(Rule rule) {
     final List<String> wordsToBeIgnored = new ArrayList<String>();
     if (rule instanceof PatternRule) {
       final PatternRule patternRule = (PatternRule) rule;
@@ -86,9 +89,94 @@
     final List<String> tokens = new ArrayList<String>();
     for (String suggestion : suggestions) {
       final List<String> suggestionTokens = 
language.getWordTokenizer().tokenize(suggestion);
-      tokens.addAll(suggestionTokens);
+      for (String suggestionToken : suggestionTokens) {
+        if (!suggestionToken.trim().isEmpty()) {
+          tokens.add(suggestionToken);
+        }
+      }
     }
     return tokens;
   }
 
+  private void writeIgnoreTokensForLanguages() throws IOException {
+    final Map<Language, Set<String>> map = getLanguageToIgnoreTokensMapping();
+    for (Map.Entry<Language, Set<String>> entry : map.entrySet()) {
+      final Language language = entry.getKey();
+      final File langDir = getLanguageDir(language);
+      final File hunspellDir = new File(langDir, "hunspell");
+      if (!hunspellDir.exists()) {
+        System.out.println("No directory " + hunspellDir + " found, ignoring 
language " + language);
+        continue;
+      }
+      final File ignoreFile = new File(hunspellDir, "ignore.txt");
+      final Set<String> tokens = entry.getValue();
+      if (tokens.size() > 0) {
+        final FileWriter writer = new FileWriter(ignoreFile);
+        try {
+          writeIntro(writer, language);
+          for (String token : tokens) {
+            writer.write(token);
+            writer.write("\n");
+          }
+        } finally {
+          writer.close();
+        }
+        System.out.println("Wrote " + tokens.size() + " words to " + 
ignoreFile);
+      }
+    }
+  }
+
+  private void writeIntro(FileWriter writer, Language language) throws 
IOException {
+    writer.write("# words to be ignored by the spellchecker (auto-generated " 
+ new Date() + ")\n");
+    writeArtificialTestCaseItems(writer, language);
+  }
+
+  private void writeArtificialTestCaseItems(FileWriter writer, Language 
language) throws IOException {
+    if (language == Language.AMERICAN_ENGLISH) {
+      writer.write("anArtificialTestWordForLanguageTool\n");
+    } else if (language == Language.GERMANY_GERMAN) {
+      writer.write("einPseudoWortFürLanguageToolTests\n");
+    }
+  }
+
+  /**
+   * We don't support sub-language resources yet, so collect all variants for 
one language.
+   */
+  private Map<Language, Set<String>> getLanguageToIgnoreTokensMapping() throws 
IOException {
+    final Map<Language, Set<String>> langToIgnoreTokens = new 
HashMap<Language, Set<String>>();
+    for (Language lang : Language.REAL_LANGUAGES) {
+      final Set<String> suggestionTokens = new HashSet<String>();
+      final JLanguageTool languageTool = new JLanguageTool(lang);
+      languageTool.activateDefaultPatternRules();
+      final List<Rule> rules = languageTool.getAllRules();
+      for (Rule rule : rules) {
+        suggestionTokens.addAll(getSuggestionTokens(rule));
+      }
+      final Language noVariantLanguage = lang.getDefaultVariant() == null ? 
lang : lang.getDefaultVariant();
+      final Set<String> existingTokens = 
langToIgnoreTokens.get(noVariantLanguage);
+      if (existingTokens != null) {
+        existingTokens.addAll(suggestionTokens);
+      } else {
+        langToIgnoreTokens.put(noVariantLanguage, suggestionTokens);
+      }
+    }
+    return langToIgnoreTokens;
+  }
+
+  private File getLanguageDir(Language language) {
+    final File dir = new File("resource", language.getShortName());
+    if (dir.exists()) {
+      return dir;
+    } else {
+      // during development (in SVN):
+      final File sourceDir = new File("src", "resource");
+      return new File(sourceDir, language.getShortName());
+    }
+  }
+
+  public static void main(String[] args) throws IOException {
+    final SuggestionExtractor extractor = new SuggestionExtractor(/*not 
used:*/Language.ENGLISH);
+    extractor.writeIgnoreTokensForLanguages();
+  }
+
 }

Modified: 
trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java
===================================================================
--- 
trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java
     2012-08-05 18:05:43 UTC (rev 7797)
+++ 
trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java
     2012-08-05 19:12:30 UTC (rev 7798)
@@ -21,9 +21,15 @@
 import junit.framework.TestCase;
 import org.languagetool.JLanguageTool;
 import org.languagetool.Language;
+import org.languagetool.TestTools;
 import org.languagetool.rules.RuleMatch;
+import org.languagetool.rules.en.MorfologikAmericanSpellerRule;
+import org.languagetool.rules.patterns.Element;
+import org.languagetool.rules.patterns.PatternRule;
+import org.languagetool.rules.spelling.hunspell.HunspellNoSuggestionRule;
 
 import java.io.IOException;
+import java.util.Collections;
 import java.util.List;
 
 public class SpellingCheckRuleTest extends TestCase {
@@ -50,4 +56,44 @@
     assertEquals("MORFOLOGIK_RULE_EN_US", matches2.get(0).getRule().getId());
   }
 
+  public void testIgnoreSuggestionsWithDynamicHunspellRule() throws 
IOException {
+    final JLanguageTool langTool = new JLanguageTool(Language.GERMANY_GERMAN);
+    final SpellingCheckRule rule = new 
HunspellNoSuggestionRule(TestTools.getEnglishMessages(), 
Language.GERMANY_GERMAN);
+    langTool.addRule(rule);
+    final List<RuleMatch> matches = langTool.check("Das ist ein Tibbfehla.");
+    assertEquals(1, matches.size());
+    assertEquals(HunspellNoSuggestionRule.RULE_ID, 
matches.get(0).getRule().getId());
+
+    final PatternRule ruleWithSuggestion = new PatternRule("TEST_ID", 
Language.GERMANY_GERMAN,
+            Collections.<Element>emptyList(), "description",
+            "Meinten Sie <suggestion>Tibbfehla</suggestion>?", null);
+    langTool.addRule(ruleWithSuggestion);
+    final List<RuleMatch> matches2 = langTool.check("Das ist ein Tibbfehla.");
+    assertEquals(0, matches2.size());   // no error anymore, as this is a 
suggestion
+
+    langTool.disableRule("TEST_ID");
+    final List<RuleMatch> matches3 = langTool.check("Das ist ein Tibbfehla.");
+    assertEquals(1, matches3.size());   // an error again
+  }
+
+  public void testIgnoreSuggestionsWithDynamicMorfologikRule() throws 
IOException {
+    final JLanguageTool langTool = new 
JLanguageTool(Language.AMERICAN_ENGLISH);
+    final SpellingCheckRule rule = new 
MorfologikAmericanSpellerRule(TestTools.getEnglishMessages(), 
Language.AMERICAN_ENGLISH);
+    langTool.addRule(rule);
+    final List<RuleMatch> matches = langTool.check("This is a typoh.");
+    assertEquals(1, matches.size());
+    assertEquals(MorfologikAmericanSpellerRule.RULE_ID, 
matches.get(0).getRule().getId());
+
+    final PatternRule ruleWithSuggestion = new PatternRule("TEST_ID", 
Language.AMERICAN_ENGLISH,
+            Collections.<Element>emptyList(), "description",
+            "Did you mean <suggestion>typoh</suggestion>?", null);
+    langTool.addRule(ruleWithSuggestion);
+    final List<RuleMatch> matches2 = langTool.check("This is a typoh.");
+    assertEquals(0, matches2.size());   // no error anymore, as this is a 
suggestion
+
+    langTool.disableRule("TEST_ID");
+    final List<RuleMatch> matches3 = langTool.check("This is a typoh.");
+    assertEquals(1, matches3.size());   // an error again
+  }
+
 }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

[LanguageTool] SF.net SVN: languagetool:[7798] trunk/JLanguageTool/src

Reply via email to