Commit f30d78df01a853f3fcc3697300641a4c55d5e42e ("allow tokens without space
before") seems to have broken support for multiple spaces inside a multiword chunk.
Previously, if a sentence had multiple spaces inside a multiword chunk,
the chunk was still recognized; now it is not.
Unfortunately there were no tests to catch that.
I am attaching the test case that illustrates the change.
Andriy
diff --git a/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java b/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java
index 340bd28..7121eb1 100644
--- a/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java
+++ b/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java
@@ -19,10 +19,12 @@
package org.languagetool.tagging.disambiguation;
import junit.framework.TestCase;
+
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;
+import org.languagetool.language.Ukrainian;
public class MultiWordChunkerTest extends TestCase {
@@ -35,4 +37,14 @@
assertTrue(tokens[4].getReadings().toString().contains("<ELLIPSIS>"));
assertTrue(tokens[6].getReadings().toString().contains("</ELLIPSIS>"));
}
+
+ public void testDisambiguateMultiSpace() throws Exception {
+ final Disambiguator chunker = new MultiWordChunker("/uk/multiwords.txt");
+ final JLanguageTool lt = new JLanguageTool(new Ukrainian());
+ final AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence("для годиться.");
+ final AnalyzedSentence disambiguated = chunker.disambiguate(analyzedSentence);
+ final AnalyzedTokenReadings[] tokens = disambiguated.getTokens();
+ assertTrue(tokens[1].getReadings().toString().contains("<adv>"));
+ assertTrue(tokens[4].getReadings().toString().contains("</adv>"));
+ }
}
------------------------------------------------------------------------------
New Year. New Location. New Benefits. New Data Center in Ashburn, VA.
GigeNET is offering a free month of service with a new server in Ashburn.
Choose from 2 high performing configs, both with 100TB of bandwidth.
Higher redundancy.Lower latency.Increased capacity.Completely compliant.
http://p.sf.net/sfu/gigenet
_______________________________________________
Languagetool-devel mailing list
Languagetool-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-devel