Revision: 8622
http://languagetool.svn.sourceforge.net/languagetool/?rev=8622&view=rev
Author: dnaber
Date: 2012-12-21 23:52:31 +0000 (Fri, 21 Dec 2012)
Log Message:
-----------
cleanup: separate morfologik speller and its LT rule matching so it can be used
separately
Modified Paths:
--------------
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
Added Paths:
-----------
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
Added:
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
===================================================================
---
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
(rev 0)
+++
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpeller.java
2012-12-21 23:52:31 UTC (rev 8622)
@@ -0,0 +1,88 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.rules.spelling.morfologik;
+
+import morfologik.speller.Speller;
+import morfologik.stemming.Dictionary;
+import org.languagetool.JLanguageTool;
+import org.languagetool.tools.StringTools;
+
+import java.io.IOException;
+import java.net.URL;
+import java.nio.charset.CharacterCodingException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Morfologik-based spell checker.
+ */
+public class MorfologikSpeller {
+
+ private final static String LANGUAGETOOL = "LanguageTool";
+
+ private final Speller speller;
+ private final Locale conversionLocale;
+
+ public MorfologikSpeller(String filename, Locale conversionLocale) throws
IOException {
+ final URL url =
JLanguageTool.getDataBroker().getFromResourceDirAsUrl(filename);
+ speller = new Speller(Dictionary.read(url));
+ this.conversionLocale = conversionLocale != null ? conversionLocale :
Locale.getDefault();
+ }
+
+ public MorfologikSpeller(String filename) throws IOException {
+ this(filename, null);
+ }
+
+ public boolean isMisspelled(String word) {
+ boolean isAlphabetic = true;
+ if (word.length() == 1) { // dictionaries usually do not contain punctuation
+ isAlphabetic = StringTools.isAlphabetic(word.charAt(0));
+ }
+ return word.length() > 0 && isAlphabetic
+ && !containsDigit(word)
+ && !LANGUAGETOOL.equals(word)
+ && !speller.isInDictionary(word)
+ && !speller.isInDictionary(word.toLowerCase(conversionLocale));
+ }
+
+ public List<String> getSuggestions(String word) {
+ final List<String> suggestions = new ArrayList<String>();
+ try {
+ suggestions.addAll(speller.findReplacements(word));
+ if (!word.toLowerCase(conversionLocale).equals(word)) {
+
suggestions.addAll(speller.findReplacements(word.toLowerCase(conversionLocale)));
+ }
+ suggestions.addAll(speller.replaceRunOnWords(word));
+ } catch (CharacterCodingException e) {
+ throw new RuntimeException(e);
+ }
+ return suggestions;
+ }
+
+ private boolean containsDigit(final String s) {
+ for (int k = 0; k < s.length(); k++) {
+ if (Character.isDigit(s.charAt(k))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+}
Modified:
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
===================================================================
---
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2012-12-21 23:26:41 UTC (rev 8621)
+++
trunk/JLanguageTool/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2012-12-21 23:52:31 UTC (rev 8622)
@@ -19,8 +19,15 @@
package org.languagetool.rules.spelling.morfologik;
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.rules.Category;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.rules.spelling.SpellingCheckRule;
+
import java.io.IOException;
-import java.net.URL;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.List;
@@ -29,25 +36,11 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import morfologik.speller.Speller;
-import morfologik.stemming.Dictionary;
-
-import org.languagetool.AnalyzedSentence;
-import org.languagetool.AnalyzedTokenReadings;
-import org.languagetool.JLanguageTool;
-import org.languagetool.Language;
-import org.languagetool.rules.Category;
-import org.languagetool.rules.RuleMatch;
-import org.languagetool.rules.spelling.SpellingCheckRule;
-import org.languagetool.tools.StringTools;
-
public abstract class MorfologikSpellerRule extends SpellingCheckRule {
- private final static String LANGUAGETOOL = "LanguageTool";
+ private MorfologikSpeller speller;
+ private Locale conversionLocale;
- private Speller speller;
- private Locale conversionLocale = Locale.getDefault();
-
/**
* Get the filename, e.g., <tt>/resource/pl/spelling.dict</tt>.
*/
@@ -73,14 +66,12 @@
@Override
public RuleMatch[] match(AnalyzedSentence text) throws IOException {
-
final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
final AnalyzedTokenReadings[] tokens =
text.getTokensWithoutWhitespace();
//lazy init
- if (speller == null) {
+ if (speller == null) {
if (JLanguageTool.getDataBroker().resourceExists(getFileName())) {
- final URL url =
JLanguageTool.getDataBroker().getFromResourceDirAsUrl(getFileName());
- speller = new Speller(Dictionary.read(url));
+ speller = new MorfologikSpeller(getFileName(),
conversionLocale);
} else {
// should not happen, as we only configure this rule (or rather its subclasses)
// when we have the resources:
@@ -89,40 +80,38 @@
}
for (AnalyzedTokenReadings token : tokens) {
final String word = token.getToken();
- if (ignoreWord(word)) {
+ if (ignoreWord(word) || token.isImmunized()) {
continue;
}
- if (!token.isImmunized()) {
- if (tokenizingPattern() == null) {
+ if (tokenizingPattern() == null) {
+ ruleMatches.addAll(getRuleMatch(word, token.getStartPos()));
+ } else {
+ int index = 0;
+ final Matcher m = tokenizingPattern().matcher(word);
+ while (m.find()) {
+ final String match = word.subSequence(index,
m.start()).toString();
+ ruleMatches.addAll(getRuleMatch(match, token.getStartPos()
+ index));
+ index = m.end();
+ }
+ if (index == 0) { // tokenizing char not found
ruleMatches.addAll(getRuleMatch(word,
token.getStartPos()));
} else {
- int index = 0;
- final Matcher m = tokenizingPattern().matcher(word);
- while (m.find()) {
- final String match = word.subSequence(index,
m.start()).toString();
- ruleMatches.addAll(getRuleMatch(match,
token.getStartPos() + index));
- index = m.end();
- }
- if (index == 0) { // tokenizing char not found
- ruleMatches.addAll(getRuleMatch(word,
token.getStartPos()));
- } else {
- ruleMatches.addAll(getRuleMatch(word.subSequence(
- index, word.length()).toString(),
token.getStartPos() + index));
- }
+ ruleMatches.addAll(getRuleMatch(word.subSequence(
+ index, word.length()).toString(),
token.getStartPos() + index));
}
}
}
return toRuleMatchArray(ruleMatches);
}
-
+
private List<RuleMatch> getRuleMatch(final String word, final int
startPos) throws CharacterCodingException {
final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
- if (isMisspelled(word)) {
+ if (speller.isMisspelled(word)) {
final RuleMatch ruleMatch = new RuleMatch(this,
startPos, startPos + word.length(),
messages.getString("spelling"),
messages.getString("desc_spelling_short"));
- final List<String> suggestions = getSuggestions(word);
+ final List<String> suggestions = speller.getSuggestions(word);
if (!suggestions.isEmpty()) {
ruleMatch.setSuggestedReplacements(suggestions);
}
@@ -131,37 +120,6 @@
return ruleMatches;
}
- private boolean isMisspelled(String word) {
- boolean isAlphabetic = true;
- if (word.length() == 1) { // dictionaries usually do not contain punctuation
- isAlphabetic = StringTools.isAlphabetic(word.charAt(0));
- }
- return word.length() > 0 && isAlphabetic
- && !containsDigit(word)
- && !LANGUAGETOOL.equals(word)
- && !speller.isInDictionary(word)
- && !speller.isInDictionary(word.toLowerCase(conversionLocale));
- }
-
- private boolean containsDigit(final String s) {
- for (int k = 0; k < s.length(); k++) {
- if (Character.isDigit(s.charAt(k))) {
- return true;
- }
- }
- return false;
- }
-
- private List<String> getSuggestions(String word) throws
CharacterCodingException {
- final List<String> suggestions = new ArrayList<String>();
- suggestions.addAll(speller.findReplacements(word));
- if (!word.toLowerCase(conversionLocale).equals(word)) {
-
suggestions.addAll(speller.findReplacements(word.toLowerCase(conversionLocale)));
- }
- suggestions.addAll(speller.replaceRunOnWords(word));
- return suggestions;
- }
-
/**
* Get the regular expression pattern used to tokenize
* the words as in the source dictionary. For example,
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits