Commit f30d78df01a853f3fcc3697300641a4c55d5e42e ("allow tokens without space
before") seems to have broken support for multiple spaces inside a multiword
chunk. Previously, a sentence with multiple spaces inside a multiword chunk
was recognized; now it is not.
Unfortunately, there were no tests to catch that.

I am attaching a test case that illustrates the regression.

Andriy
diff --git a/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java b/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java
index 340bd28..7121eb1 100644
--- a/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java
+++ b/languagetool-standalone/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java
@@ -19,10 +19,12 @@
 package org.languagetool.tagging.disambiguation;
 
 import junit.framework.TestCase;
+
 import org.languagetool.AnalyzedSentence;
 import org.languagetool.AnalyzedTokenReadings;
 import org.languagetool.JLanguageTool;
 import org.languagetool.language.English;
+import org.languagetool.language.Ukrainian;
 
 public class MultiWordChunkerTest extends TestCase {
 
@@ -35,4 +37,14 @@
     assertTrue(tokens[4].getReadings().toString().contains("<ELLIPSIS>"));
     assertTrue(tokens[6].getReadings().toString().contains("</ELLIPSIS>"));
   }
+
+  public void testDisambiguateMultiSpace() throws Exception {
+    final Disambiguator chunker = new MultiWordChunker("/uk/multiwords.txt");
+    final JLanguageTool lt = new JLanguageTool(new Ukrainian());
+    final AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence("для  годиться.");
+    final AnalyzedSentence disambiguated = chunker.disambiguate(analyzedSentence);
+    final AnalyzedTokenReadings[] tokens = disambiguated.getTokens();
+    assertTrue(tokens[1].getReadings().toString().contains("<adv>"));
+    assertTrue(tokens[4].getReadings().toString().contains("</adv>"));
+  }
 }
------------------------------------------------------------------------------
New Year. New Location. New Benefits. New Data Center in Ashburn, VA.
GigeNET is offering a free month of service with a new server in Ashburn.
Choose from 2 high performing configs, both with 100TB of bandwidth.
Higher redundancy.Lower latency.Increased capacity.Completely compliant.
http://p.sf.net/sfu/gigenet
_______________________________________________
Languagetool-devel mailing list
Languagetool-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-devel

Reply via email to