Hi,
I think I found a way to speed up the checking of larger amounts of text
with a simple trick that doesn't introduce a lot of complexity. The idea
is that a lot of rules have at least one "simple" token, i.e. one that is
neither a regular expression, nor a POS tag, nor inflected. If we put all
"simple" tokens of a rule in a set, we can intersect that set with all
words from a sentence to see if the rule might match. If the intersection
is empty, the rule cannot match and we can skip it.
Using org.languagetool.rules.patterns.PerformanceTest I got these
improvements using 20KB of text:
German: 10 ms/sentence -> 7.7 ms/sentence
French: 15 ms/sentence -> 13 ms/sentence
Let me know if you see a problem with that or if you have an idea how to
make it even faster.
Regards
Daniel
--
http://www.danielnaber.de
Index: languagetool-core/src/main/java/org/languagetool/AnalyzedSentence.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- languagetool-core/src/main/java/org/languagetool/AnalyzedSentence.java (date 1384686689000)
+++ languagetool-core/src/main/java/org/languagetool/AnalyzedSentence.java (revision )
@@ -20,10 +20,7 @@
import org.apache.commons.lang.StringUtils;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
/**
* A sentence that has been tokenized and analyzed.
@@ -35,6 +32,7 @@
private final AnalyzedTokenReadings[] tokens;
private AnalyzedTokenReadings[] nonBlankTokens;
+ private Set<String> tokenSet;
/**
* Array mapping positions of tokens as returned with
@@ -201,6 +199,16 @@
*/
public void setNonBlankTokens(AnalyzedTokenReadings[] nonBlankTokens) {
this.nonBlankTokens = nonBlankTokens;
+ }
+
+ /**
+  * Returns the lowercased {@code getToken()} value of every token of this
+  * sentence. The set is built on the first call and cached; it is returned
+  * read-only so that callers cannot modify the cached instance.
+  */
+ public synchronized Set<String> wordSet() {
+   if (tokenSet == null) {
+     // Build into a local set and publish it only once it is complete, so an
+     // exception thrown while iterating cannot leave a half-filled set cached.
+     final Set<String> words = new HashSet<>();
+     for (AnalyzedTokenReadings token : tokens) {
+       // NOTE(review): lowercases with the default locale, like the rule-side
+       // set in PatternRule; both sides must lowercase the same way.
+       words.add(token.getToken().toLowerCase());
+     }
+     tokenSet = Collections.unmodifiableSet(words);
+   }
+   return tokenSet;
}
@Override
Index: languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- languagetool-core/src/main/java/org/languagetool/JLanguageTool.java (date 1384686689000)
+++ languagetool-core/src/main/java/org/languagetool/JLanguageTool.java (revision )
@@ -634,6 +634,11 @@
throws IOException {
final List<RuleMatch> sentenceMatches = new ArrayList<>();
for (final Rule rule : allRules) {
+
+ if (rule instanceof PatternRule && ((PatternRule)rule).canBeIgnoredFor(analyzedSentence)) {
+ continue;
+ }
+
if (disabledRules.contains(rule.getId())
|| (rule.isDefaultOff() && !enabledRules.contains(rule.getId()))) {
continue;
Index: languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRule.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRule.java (date 1384686689000)
+++ languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRule.java (revision )
@@ -19,8 +19,7 @@
package org.languagetool.rules.patterns;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
import org.languagetool.AnalyzedSentence;
import org.languagetool.Language;
@@ -52,6 +51,8 @@
/** Formatted suggestion elements outside message. **/
private List<Match> suggestionMatchesOutMsg;
+ private Set<String> wordSet;
+
/**
* This property is used for short-circuiting evaluation of the elementNo list
* order.
@@ -217,6 +218,34 @@
*/
public final List<Element> getElements() {
return patternElements;
+ }
+
+ /**
+  * Cheap pre-check that tells whether this rule can be skipped for a sentence.
+  *
+  * @param sentence the analyzed sentence about to be checked
+  * @return {@code true} if this rule has at least one "simple" token and none
+  *         of its simple tokens occurs in the sentence's word set;
+  *         {@code false} otherwise, in which case the rule has to be run
+  */
+ public boolean canBeIgnoredFor(AnalyzedSentence sentence) {
+   final Set<String> ruleWords = wordSet();
+   if (ruleWords.isEmpty()) {
+     // this rule has no simple token, so it is too complicated for this pre-check
+     return false;
+   }
+   // Collections.disjoint() gives the same answer as building the intersection
+   // and testing it for emptiness, but without copying the rule's set on every
+   // call. If the sets are disjoint, the rule can never match this sentence.
+   return Collections.disjoint(ruleWords, sentence.wordSet());
+ }
+
+ private synchronized Set<String> wordSet() {
+ if (wordSet == null) {
+ wordSet = new HashSet<>();
+ for (Element element : patternElements) {
+ if (!element.getNegation() && !element.hasExceptionList() && !element.isRegularExpression()
+ && !element.isReferenceElement() && !element.isInflected()) {
+ wordSet.add(element.getString().toLowerCase());
+ }
+ }
+ }
+ return wordSet;
}
List<Integer> getElementNo() {
------------------------------------------------------------------------------
Shape the Mobile Experience: Free Subscription
Software experts and developers: Be at the forefront of tech innovation.
Intel(R) Software Adrenaline delivers strategic insight and game-changing
conversations that shape the rapidly evolving mobile landscape. Sign up now.
http://pubads.g.doubleclick.net/gampad/clk?id=63431311&iu=/4140/ostg.clktrk
_______________________________________________
Languagetool-devel mailing list
Languagetool-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-devel