Revision: 7660
http://languagetool.svn.sourceforge.net/languagetool/?rev=7660&view=rev
Author: milek_pl
Date: 2012-07-07 09:48:09 +0000 (Sat, 07 Jul 2012)
Log Message:
-----------
remove hack and make class safer for different tokenizers that result in
variable-length separators
Modified Paths:
--------------
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
Modified:
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
2012-07-07 09:01:46 UTC (rev 7659)
+++
trunk/JLanguageTool/src/java/org/languagetool/rules/br/MorfologikBretonSpellerRule.java
2012-07-07 09:48:09 UTC (rev 7660)
@@ -51,9 +51,4 @@
return BRETON_TOKENIZING_CHARS;
}
- @Override
- public int separatorLength() {
- return 1;
- }
-
}
Modified:
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2012-07-07 09:01:46 UTC (rev 7659)
+++
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/morfologik/MorfologikSpellerRule.java
2012-07-07 09:48:09 UTC (rev 7660)
@@ -26,6 +26,7 @@
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import morfologik.speller.Speller;
@@ -73,7 +74,7 @@
@Override
public RuleMatch[] match(AnalyzedSentence text) throws IOException {
-
+
final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
final AnalyzedTokenReadings[] tokens =
text.getTokensWithoutWhitespace();
//lazy init
@@ -93,14 +94,22 @@
if (tokenizingPattern() == null) {
ruleMatches.addAll(getRuleMatch(word,
token.getStartPos()));
} else {
- int i = 0;
- for (final String internalSplit :
tokenizingPattern().split(word)) {
- ruleMatches.addAll(getRuleMatch(internalSplit,
token.getStartPos() + i));
- i += internalSplit.length() + separatorLength();
+ int index = 0;
+ final Matcher m = tokenizingPattern().matcher(word);
+ while(m.find()) {
+ final String match = word.subSequence(index,
m.start()).toString();
+ index = m.end();
+ ruleMatches.addAll(getRuleMatch(match,
token.getStartPos() + index));
}
+ if (index == 0) { // tokenizing char not found
+ ruleMatches.addAll(getRuleMatch(word,
token.getStartPos()));
+ } else {
+ ruleMatches.addAll(getRuleMatch(word.subSequence(
+ index, word.length()).toString(),
token.getStartPos() + index));
+ }
}
}
- }
+ }
return toRuleMatchArray(ruleMatches);
}
@@ -142,13 +151,16 @@
}
return false;
}
-
+
+ /**
+ * Get the regular expression pattern used to tokenize
+ * the words as in the source dictionary. For example,
+ * it may contain a hyphen if words with hyphens are
+ * not included in the dictionary.
+ * @return A compiled {@link Pattern} that is used to tokenize words.
+ */
public Pattern tokenizingPattern() {
return null;
}
- public int separatorLength() {
- return 0;
- }
-
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs