morfologik

dnaber Fri, 21 Dec 2012 15:52:52 -0800

Revision: 8622
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=8622&view=rev
Author:   dnaber
Date:     2012-12-21 23:52:31 +0000 (Fri, 21 Dec 2012)
Log Message:
-----------
cleanup: separate morfologik speller and its LT rule matching so it can be used 
separately


Modified Paths:
--------------
    
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java

Added Paths:
-----------
    
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java

Added: 
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
===================================================================
--- 
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
                         (rev 0)
+++ 
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
 2012-12-21 23:52:31 UTC (rev 8622)
@@ -0,0 +1,88 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.rules.spelling.morfologik;
+
+import morfologik.speller.Speller;
+import morfologik.stemming.Dictionary;
+import org.languagetool.JLanguageTool;
+import org.languagetool.tools.StringTools;
+
+import java.io.IOException;
+import java.net.URL;
+import java.nio.charset.CharacterCodingException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Morfologik-based spell checker.
+ */
+public class MorfologikSpeller {
+
+  private final static String LANGUAGETOOL = "LanguageTool";
+
+  private final Speller speller;
+  private final Locale conversionLocale;
+
+  public MorfologikSpeller(String filename, Locale conversionLocale) throws 
IOException {
+    final URL url = 
JLanguageTool.getDataBroker().getFromResourceDirAsUrl(filename);
+    speller = new Speller(Dictionary.read(url));
+    this.conversionLocale = conversionLocale != null ? conversionLocale : 
Locale.getDefault();
+  }
+
+  public MorfologikSpeller(String filename) throws IOException {
+    this(filename, null);
+  }
+
+  public boolean isMisspelled(String word) {
+    boolean isAlphabetic = true;
+    if (word.length() == 1) { // dictionaries usually do not contain 
punctuation
+      isAlphabetic = StringTools.isAlphabetic(word.charAt(0));
+    }
+    return word.length() > 0 && isAlphabetic
+            && !containsDigit(word)
+            && !LANGUAGETOOL.equals(word)
+            && !speller.isInDictionary(word)
+            && !speller.isInDictionary(word.toLowerCase(conversionLocale));
+  }
+
+  public List<String> getSuggestions(String word) {
+    final List<String> suggestions = new ArrayList<String>();
+    try {
+      suggestions.addAll(speller.findReplacements(word));
+      if (!word.toLowerCase(conversionLocale).equals(word)) {
+        
suggestions.addAll(speller.findReplacements(word.toLowerCase(conversionLocale)));
+      }
+      suggestions.addAll(speller.replaceRunOnWords(word));
+    } catch (CharacterCodingException e) {
+      throw new RuntimeException(e);
+    }
+    return suggestions;
+  }
+
+  private boolean containsDigit(final String s) {
+    for (int k = 0; k < s.length(); k++) {
+      if (Character.isDigit(s.charAt(k))) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}

Modified: 
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
===================================================================
--- 
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
     2012-12-21 23:26:41 UTC (rev 8621)
+++ 
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
     2012-12-21 23:52:31 UTC (rev 8622)
@@ -19,8 +19,15 @@
 
 package org.languagetool.rules.spelling.morfologik;
 
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.rules.Category;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.rules.spelling.SpellingCheckRule;
+
 import java.io.IOException;
-import java.net.URL;
 import java.nio.charset.CharacterCodingException;
 import java.util.ArrayList;
 import java.util.List;
@@ -29,25 +36,11 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import morfologik.speller.Speller;
-import morfologik.stemming.Dictionary;
-
-import org.languagetool.AnalyzedSentence;
-import org.languagetool.AnalyzedTokenReadings;
-import org.languagetool.JLanguageTool;
-import org.languagetool.Language;
-import org.languagetool.rules.Category;
-import org.languagetool.rules.RuleMatch;
-import org.languagetool.rules.spelling.SpellingCheckRule;
-import org.languagetool.tools.StringTools;
-
 public abstract class MorfologikSpellerRule extends SpellingCheckRule {
 
-    private final static String LANGUAGETOOL = "LanguageTool";
+    private MorfologikSpeller speller;
+    private Locale conversionLocale;
 
-    private Speller speller;
-    private Locale conversionLocale = Locale.getDefault();
-
     /**
      * Get the filename, e.g., <tt>/resource/pl/spelling.dict</tt>.
      */
@@ -73,14 +66,12 @@
     
     @Override
     public RuleMatch[] match(AnalyzedSentence text) throws IOException {
-
         final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
         final AnalyzedTokenReadings[] tokens = 
text.getTokensWithoutWhitespace();
         //lazy init
-        if (speller == null) {                                   
+        if (speller == null) {
             if (JLanguageTool.getDataBroker().resourceExists(getFileName())) {
-                final URL url = 
JLanguageTool.getDataBroker().getFromResourceDirAsUrl(getFileName());
-                speller = new Speller(Dictionary.read(url));
+                speller = new MorfologikSpeller(getFileName(), 
conversionLocale);
             } else {
                 // should not happen, as we only configure this rule (or 
rather its subclasses)
                 // when we have the resources:
@@ -89,40 +80,38 @@
         }
         for (AnalyzedTokenReadings token : tokens) {
             final String word = token.getToken();
-            if (ignoreWord(word)) {
+            if (ignoreWord(word) || token.isImmunized()) {
                 continue;
             }
-            if (!token.isImmunized()) {
-                if (tokenizingPattern() == null) {
+            if (tokenizingPattern() == null) {
+                ruleMatches.addAll(getRuleMatch(word, token.getStartPos()));
+            } else {
+                int index = 0;
+                final Matcher m = tokenizingPattern().matcher(word);
+                while (m.find()) {
+                    final String match = word.subSequence(index, 
m.start()).toString();
+                    ruleMatches.addAll(getRuleMatch(match, token.getStartPos() 
+ index));
+                    index = m.end();
+                }
+                if (index == 0) { // tokenizing char not found
                     ruleMatches.addAll(getRuleMatch(word, 
token.getStartPos()));
                 } else {
-                    int index = 0;
-                    final Matcher m = tokenizingPattern().matcher(word);
-                    while (m.find()) {
-                        final String match = word.subSequence(index, 
m.start()).toString();                        
-                        ruleMatches.addAll(getRuleMatch(match, 
token.getStartPos() + index));
-                        index = m.end();
-                    }
-                    if (index == 0) { // tokenizing char not found
-                        ruleMatches.addAll(getRuleMatch(word, 
token.getStartPos()));
-                    } else {
-                        ruleMatches.addAll(getRuleMatch(word.subSequence(
-                                index, word.length()).toString(), 
token.getStartPos() + index)); 
-                    }
+                    ruleMatches.addAll(getRuleMatch(word.subSequence(
+                            index, word.length()).toString(), 
token.getStartPos() + index));
                 }
             }
         }
         return toRuleMatchArray(ruleMatches);
     }
-    
+
     private List<RuleMatch> getRuleMatch(final String word, final int 
startPos) throws CharacterCodingException {
         final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
-        if (isMisspelled(word)) {
+        if (speller.isMisspelled(word)) {
             final RuleMatch ruleMatch = new RuleMatch(this,
                     startPos, startPos + word.length(),
                     messages.getString("spelling"),
                     messages.getString("desc_spelling_short"));
-            final List<String> suggestions = getSuggestions(word);
+            final List<String> suggestions = speller.getSuggestions(word);
             if (!suggestions.isEmpty()) {
                 ruleMatch.setSuggestedReplacements(suggestions);
             }
@@ -131,37 +120,6 @@
         return ruleMatches;
     }
 
-    private boolean isMisspelled(String word) {
-        boolean isAlphabetic = true;
-        if (word.length() == 1) { // dictionaries usually do not contain 
punctuation
-            isAlphabetic = StringTools.isAlphabetic(word.charAt(0));
-        }
-        return word.length() > 0 && isAlphabetic
-                && !containsDigit(word)
-                && !LANGUAGETOOL.equals(word)
-                && !speller.isInDictionary(word)
-                && !speller.isInDictionary(word.toLowerCase(conversionLocale));
-    }
-
-    private boolean containsDigit(final String s) {
-        for (int k = 0; k < s.length(); k++) {
-            if (Character.isDigit(s.charAt(k))) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    private List<String> getSuggestions(String word) throws 
CharacterCodingException {
-        final List<String> suggestions = new ArrayList<String>();
-        suggestions.addAll(speller.findReplacements(word));
-        if (!word.toLowerCase(conversionLocale).equals(word)) {
-            
suggestions.addAll(speller.findReplacements(word.toLowerCase(conversionLocale)));
-        }
-        suggestions.addAll(speller.replaceRunOnWords(word));
-        return suggestions;
-    }
-
     /**
      * Get the regular expression pattern used to tokenize
      * the words as in the source dictionary. For example,

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Languagetool-commits mailing list
Languagetool-commits@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-commits

SF.net SVN: languagetool:[8622] trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik

Reply via email to