Revision: 7798 http://languagetool.svn.sourceforge.net/languagetool/?rev=7798&view=rev Author: dnaber Date: 2012-08-05 19:12:30 +0000 (Sun, 05 Aug 2012) Log Message: ----------- when adding rules programmatically, extract the words from their suggestions so that the spell checker ignores them; extend the extractor with a main method that extracts all ignore tokens for all languages
Modified Paths: -------------- trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java Modified: trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java 2012-08-05 18:05:43 UTC (rev 7797) +++ trunk/JLanguageTool/src/java/org/languagetool/JLanguageTool.java 2012-08-05 19:12:30 UTC (rev 7798) @@ -50,6 +50,8 @@ import org.languagetool.rules.patterns.FalseFriendRuleLoader; import org.languagetool.rules.patterns.PatternRule; import org.languagetool.rules.patterns.PatternRuleLoader; +import org.languagetool.rules.spelling.SpellingCheckRule; +import org.languagetool.rules.spelling.SuggestionExtractor; import org.languagetool.tagging.Tagger; import org.languagetool.tagging.disambiguation.Disambiguator; import org.languagetool.tokenizers.Tokenizer; @@ -388,28 +390,65 @@ */ public void addRule(final Rule rule) { userRules.add(rule); + // TODO: not on first start - too slow: + final SuggestionExtractor extractor = new SuggestionExtractor(language); + final List<String> suggestionTokens = extractor.getSuggestionTokens(rule); + final List<Rule> allActiveRules = getAllActiveRules(); + addIgnoreWords(suggestionTokens, allActiveRules); } + private void addIgnoreWords(List<String> suggestionTokens, List<Rule> allActiveRules) { + for (Rule activeRule : allActiveRules) { + if (activeRule instanceof SpellingCheckRule) { + ((SpellingCheckRule)activeRule).addIgnoreTokens(suggestionTokens); + } + } + } + + private void setIgnoreWords(List<String> suggestionTokens, List<Rule> allActiveRules) { + for (Rule activeRule : allActiveRules) { + if (activeRule instanceof SpellingCheckRule) { 
+ ((SpellingCheckRule)activeRule).resetIgnoreTokens(); + ((SpellingCheckRule)activeRule).addIgnoreTokens(suggestionTokens); + } + } + } + /** * Disable a given rule so {@link #check(String)} won't use it. * - * @param ruleId - * the id of the rule to disable + * @param ruleId the id of the rule to disable - no error will be given if the id does not exist */ public void disableRule(final String ruleId) { - // TODO: check if such a rule exists disabledRules.add(ruleId); + reInitSpellCheckIgnoreWords(); } + private void reInitSpellCheckIgnoreWords() { + final List<Rule> allActiveRules = getAllActiveRules(); + final List<String> ignoreTokens = getAllIgnoreWords(allActiveRules); + setIgnoreWords(ignoreTokens, allActiveRules); + } + + private List<String> getAllIgnoreWords(List<Rule> allActiveRules) { + final List<String> suggestionTokens = new ArrayList<String>(); + for (Rule activeRule : allActiveRules) { + if (activeRule instanceof PatternRule) { + final SuggestionExtractor extractor = new SuggestionExtractor(language); + suggestionTokens.addAll(extractor.getSuggestionTokens(activeRule)); + } + } + return suggestionTokens; + } + /** * Disable a given category so {@link #check(String)} won't use it. 
* - * @param categoryName - * the id of the category to disable + * @param categoryName the id of the category to disable - no error will be given if the id does not exist */ public void disableCategory(final String categoryName) { - // TODO: check if such a rule exists disabledCategories.add(categoryName); + reInitSpellCheckIgnoreWords(); } /** Modified: trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java 2012-08-05 18:05:43 UTC (rev 7797) +++ trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SpellingCheckRule.java 2012-08-05 19:12:30 UTC (rev 7798) @@ -63,6 +63,25 @@ public void reset() { } + /** + * Add the given words to the list of words to be ignored during spell check. + */ + public void addIgnoreTokens(List<String> tokens) { + wordsToBeIgnored.addAll(tokens); + } + + /** + * Reset the list of words to be ignored, by re-loading it from the "ignore.txt" file. + */ + public void resetIgnoreTokens() { + wordsToBeIgnored.clear(); + try { + init(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + protected boolean ignoreWord(String word) throws IOException { // TODO?: this is needed at least for German as Hunspell tokenization includes the dot: final String cleanWord = word.endsWith(".") ? 
word.substring(0, word.length() - 1) : word; @@ -71,7 +90,6 @@ protected void init() throws IOException { loadFileIfExists(language.getShortName() + SPELLING_IGNORE_FILE); - loadFileIfExists(language.getShortNameWithVariant() + SPELLING_IGNORE_FILE); } private void loadFileIfExists(String filename) throws IOException { Modified: trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java 2012-08-05 18:05:43 UTC (rev 7797) +++ trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/SuggestionExtractor.java 2012-08-05 19:12:30 UTC (rev 7798) @@ -18,19 +18,22 @@ */ package org.languagetool.rules.spelling; +import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.rules.Rule; import org.languagetool.rules.patterns.PatternRule; -import java.util.ArrayList; -import java.util.List; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Extract tokens from suggestions. */ -class SuggestionExtractor { +public class SuggestionExtractor { private final static Pattern SUGGESTION_PATTERN = Pattern.compile("<suggestion.*?>(.*?)</suggestion>"); private final static Pattern BACK_REFERENCE_PATTERN = Pattern.compile("\\\\" + "\\d+"); @@ -43,7 +46,7 @@ /** * Get the tokens of simple suggestions, i.e. those that don't use back references. 
*/ - List<String> getSuggestionTokens(Rule rule) { + public List<String> getSuggestionTokens(Rule rule) { final List<String> wordsToBeIgnored = new ArrayList<String>(); if (rule instanceof PatternRule) { final PatternRule patternRule = (PatternRule) rule; @@ -86,9 +89,94 @@ final List<String> tokens = new ArrayList<String>(); for (String suggestion : suggestions) { final List<String> suggestionTokens = language.getWordTokenizer().tokenize(suggestion); - tokens.addAll(suggestionTokens); + for (String suggestionToken : suggestionTokens) { + if (!suggestionToken.trim().isEmpty()) { + tokens.add(suggestionToken); + } + } } return tokens; } + private void writeIgnoreTokensForLanguages() throws IOException { + final Map<Language, Set<String>> map = getLanguageToIgnoreTokensMapping(); + for (Map.Entry<Language, Set<String>> entry : map.entrySet()) { + final Language language = entry.getKey(); + final File langDir = getLanguageDir(language); + final File hunspellDir = new File(langDir, "hunspell"); + if (!hunspellDir.exists()) { + System.out.println("No directory " + hunspellDir + " found, ignoring language " + language); + continue; + } + final File ignoreFile = new File(hunspellDir, "ignore.txt"); + final Set<String> tokens = entry.getValue(); + if (tokens.size() > 0) { + final FileWriter writer = new FileWriter(ignoreFile); + try { + writeIntro(writer, language); + for (String token : tokens) { + writer.write(token); + writer.write("\n"); + } + } finally { + writer.close(); + } + System.out.println("Wrote " + tokens.size() + " words to " + ignoreFile); + } + } + } + + private void writeIntro(FileWriter writer, Language language) throws IOException { + writer.write("# words to be ignored by the spellchecker (auto-generated " + new Date() + ")\n"); + writeArtificialTestCaseItems(writer, language); + } + + private void writeArtificialTestCaseItems(FileWriter writer, Language language) throws IOException { + if (language == Language.AMERICAN_ENGLISH) { + 
writer.write("anArtificialTestWordForLanguageTool\n"); + } else if (language == Language.GERMANY_GERMAN) { + writer.write("einPseudoWortFürLanguageToolTests\n"); + } + } + + /** + * We don't support sub-language resources yet, so collect all variants for one language. + */ + private Map<Language, Set<String>> getLanguageToIgnoreTokensMapping() throws IOException { + final Map<Language, Set<String>> langToIgnoreTokens = new HashMap<Language, Set<String>>(); + for (Language lang : Language.REAL_LANGUAGES) { + final Set<String> suggestionTokens = new HashSet<String>(); + final JLanguageTool languageTool = new JLanguageTool(lang); + languageTool.activateDefaultPatternRules(); + final List<Rule> rules = languageTool.getAllRules(); + for (Rule rule : rules) { + suggestionTokens.addAll(getSuggestionTokens(rule)); + } + final Language noVariantLanguage = lang.getDefaultVariant() == null ? lang : lang.getDefaultVariant(); + final Set<String> existingTokens = langToIgnoreTokens.get(noVariantLanguage); + if (existingTokens != null) { + existingTokens.addAll(suggestionTokens); + } else { + langToIgnoreTokens.put(noVariantLanguage, suggestionTokens); + } + } + return langToIgnoreTokens; + } + + private File getLanguageDir(Language language) { + final File dir = new File("resource", language.getShortName()); + if (dir.exists()) { + return dir; + } else { + // during development (in SVN): + final File sourceDir = new File("src", "resource"); + return new File(sourceDir, language.getShortName()); + } + } + + public static void main(String[] args) throws IOException { + final SuggestionExtractor extractor = new SuggestionExtractor(/*not used:*/Language.ENGLISH); + extractor.writeIgnoreTokensForLanguages(); + } + } Modified: trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java =================================================================== --- trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java 2012-08-05 
18:05:43 UTC (rev 7797) +++ trunk/JLanguageTool/src/test/org/languagetool/rules/spelling/SpellingCheckRuleTest.java 2012-08-05 19:12:30 UTC (rev 7798) @@ -21,9 +21,15 @@ import junit.framework.TestCase; import org.languagetool.JLanguageTool; import org.languagetool.Language; +import org.languagetool.TestTools; import org.languagetool.rules.RuleMatch; +import org.languagetool.rules.en.MorfologikAmericanSpellerRule; +import org.languagetool.rules.patterns.Element; +import org.languagetool.rules.patterns.PatternRule; +import org.languagetool.rules.spelling.hunspell.HunspellNoSuggestionRule; import java.io.IOException; +import java.util.Collections; import java.util.List; public class SpellingCheckRuleTest extends TestCase { @@ -50,4 +56,44 @@ assertEquals("MORFOLOGIK_RULE_EN_US", matches2.get(0).getRule().getId()); } + public void testIgnoreSuggestionsWithDynamicHunspellRule() throws IOException { + final JLanguageTool langTool = new JLanguageTool(Language.GERMANY_GERMAN); + final SpellingCheckRule rule = new HunspellNoSuggestionRule(TestTools.getEnglishMessages(), Language.GERMANY_GERMAN); + langTool.addRule(rule); + final List<RuleMatch> matches = langTool.check("Das ist ein Tibbfehla."); + assertEquals(1, matches.size()); + assertEquals(HunspellNoSuggestionRule.RULE_ID, matches.get(0).getRule().getId()); + + final PatternRule ruleWithSuggestion = new PatternRule("TEST_ID", Language.GERMANY_GERMAN, + Collections.<Element>emptyList(), "description", + "Meinten Sie <suggestion>Tibbfehla</suggestion>?", null); + langTool.addRule(ruleWithSuggestion); + final List<RuleMatch> matches2 = langTool.check("Das ist ein Tibbfehla."); + assertEquals(0, matches2.size()); // no error anymore, as this is a suggestion + + langTool.disableRule("TEST_ID"); + final List<RuleMatch> matches3 = langTool.check("Das ist ein Tibbfehla."); + assertEquals(1, matches3.size()); // an error again + } + + public void testIgnoreSuggestionsWithDynamicMorfologikRule() throws IOException { + final 
JLanguageTool langTool = new JLanguageTool(Language.AMERICAN_ENGLISH); + final SpellingCheckRule rule = new MorfologikAmericanSpellerRule(TestTools.getEnglishMessages(), Language.AMERICAN_ENGLISH); + langTool.addRule(rule); + final List<RuleMatch> matches = langTool.check("This is a typoh."); + assertEquals(1, matches.size()); + assertEquals(MorfologikAmericanSpellerRule.RULE_ID, matches.get(0).getRule().getId()); + + final PatternRule ruleWithSuggestion = new PatternRule("TEST_ID", Language.AMERICAN_ENGLISH, + Collections.<Element>emptyList(), "description", + "Did you mean <suggestion>typoh</suggestion>?", null); + langTool.addRule(ruleWithSuggestion); + final List<RuleMatch> matches2 = langTool.check("This is a typoh."); + assertEquals(0, matches2.size()); // no error anymore, as this is a suggestion + + langTool.disableRule("TEST_ID"); + final List<RuleMatch> matches3 = langTool.check("This is a typoh."); + assertEquals(1, matches3.size()); // an error again + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Live Security Virtual Conference Exclusive live event will cover all the ways today's security and threat landscape has changed and how IT managers can respond. Discussions will include endpoint security, mobile security and the latest in malware threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/ _______________________________________________ Languagetool-cvs mailing list Languagetool-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-cvs