Here's a PoC patch to allow ignoring arbitrary characters when analyzing text.
Currently we ignore soft hyphens, but some languages need to
ignore other characters as well. E.g. in Ukrainian it's very useful to
ignore the accent mark (U+0301), so I extended the current soft-hyphen
logic so that each language can define its own set of ignored characters.
It actually sets a regex, so you can ignore any combination of characters
too if you wish.
There are two things to note:
1) When searching for ignored chars, the regex match will be slower
than indexOf(); most texts (I expect) won't contain ignored
chars, so there may be some slowdown. This code runs before we split into
threads, so the bottleneck gets narrower.
We could change the regex into an array of chars and use a for loop/indexOf
(this should still be faster than a regexp for several characters).
2) I could not understand the existing logic of adding the original
token (with ignored chars) with a null tag to the readings in
getRawAnalyzedSentence(). This logic broke some of my rules (that's
why I put in the if( false ) ), but if it is really needed I can adjust my
rules to handle it.
If this looks good and we agree on the final logic, I'll clean up the
patch (rename the method replaceSoftHyphens(), remove unnecessary
code, add javadocs) and push it in.
Thanks,
Andriy
diff --git a/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java b/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
index c826a38..75d40d1 100644
--- a/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
+++ b/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
@@ -31,6 +31,7 @@
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
+
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -40,6 +41,7 @@
import java.util.*;
import java.util.concurrent.Callable;
import java.util.jar.Manifest;
+import java.util.regex.Pattern;
/**
* The main class used for checking text against different rules:
@@ -699,6 +701,7 @@
aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix);
if (!softHyphenTokens.isEmpty()) {
if (softHyphenTokens.get(i) != null) {
+ if( false ) // AR: why are we adding a reading with null tag?
aTokens.get(i).addReading(language.getTagger().createToken(softHyphenTokens.get(i), null));
posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length();
}
@@ -737,11 +740,16 @@
}
private Map<Integer, String> replaceSoftHyphens(List<String> tokens) {
+ Pattern ignoredCharacterRegex = language.getIgnoredCharactersRegex();
+
final Map<Integer, String> softHyphenTokens = new HashMap<>();
+ if( ignoredCharacterRegex == null )
+ return softHyphenTokens;
+
for (int i = 0; i < tokens.size(); i++) {
- if (tokens.get(i).indexOf('\u00ad') != -1) {
+ if ( ignoredCharacterRegex.matcher(tokens.get(i)).find() ) {
softHyphenTokens.put(i, tokens.get(i));
- tokens.set(i, tokens.get(i).replaceAll("\u00ad", ""));
+ tokens.set(i, ignoredCharacterRegex.matcher(tokens.get(i)).replaceAll(""));
}
}
return softHyphenTokens;
diff --git a/languagetool-core/src/main/java/org/languagetool/Language.java b/languagetool-core/src/main/java/org/languagetool/Language.java
index 0c0292a..dcc690f 100644
--- a/languagetool-core/src/main/java/org/languagetool/Language.java
+++ b/languagetool-core/src/main/java/org/languagetool/Language.java
@@ -43,6 +43,7 @@
import java.lang.reflect.Constructor;
import java.net.URL;
import java.util.*;
+import java.util.regex.Pattern;
/**
* Base class for any supported language (English, German, etc). Language classes
@@ -58,6 +59,7 @@
private static final String PROPERTIES_KEY = "languageClasses";
private static List<Language> externalLanguages = new ArrayList<>();
+ private Pattern ignoredCharactersRegex = Pattern.compile("[\u00AD]");
private boolean isExternalLanguage = false;
@@ -716,4 +718,12 @@
return getCountries().length == 1;
}
+ public Pattern getIgnoredCharactersRegex() {
+ return ignoredCharactersRegex;
+ }
+
+ public void setIgnoredCharacters(Pattern ignoredCharacters) {
+ this.ignoredCharactersRegex = ignoredCharacters;
+ }
+
}
diff --git a/languagetool-language-modules/uk/src/main/java/org/languagetool/language/Ukrainian.java b/languagetool-language-modules/uk/src/main/java/org/languagetool/language/Ukrainian.java
index 91fd2b1..ae04e67 100644
--- a/languagetool-language-modules/uk/src/main/java/org/languagetool/language/Ukrainian.java
+++ b/languagetool-language-modules/uk/src/main/java/org/languagetool/language/Ukrainian.java
@@ -23,6 +23,7 @@
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
+import java.util.regex.Pattern;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
@@ -62,6 +63,10 @@
private Disambiguator disambiguator;
private String name = "Ukrainian";
+ public Ukrainian() {
+ setIgnoredCharacters(Pattern.compile("[\u00A0\u0301]"));
+ }
+
@Override
public Locale getLocale() {
return new Locale(getShortName());
------------------------------------------------------------------------------
New Year. New Location. New Benefits. New Data Center in Ashburn, VA.
GigeNET is offering a free month of service with a new server in Ashburn.
Choose from 2 high performing configs, both with 100TB of bandwidth.
Higher redundancy.Lower latency.Increased capacity.Completely compliant.
http://p.sf.net/sfu/gigenet
_______________________________________________
Languagetool-devel mailing list
Languagetool-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-devel