cassandra git commit: Correct english word stemming test and add a test for french

xedin Sun, 26 Jun 2016 01:53:27 -0700

Repository: cassandra
Updated Branches:
  refs/heads/trunk f1cabcade -> eb82861c8



Correct english word stemming test and add a test for french

patch by doanduyhai; reviewed by xedin for CASSANDRA-12078


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/eb82861c
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/eb82861c
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/eb82861c

Branch: refs/heads/trunk
Commit: eb82861c8d4c497d64b5e61a1606bdd270e8e109
Parents: f1cabca
Author: Pavel Yaskevich <[email protected]>
Authored: Sun Jun 26 01:48:23 2016 -0700
Committer: Pavel Yaskevich <[email protected]>
Committed: Sun Jun 26 01:50:32 2016 -0700

----------------------------------------------------------------------
 .../sasi/analyzer/filter/StemmingFilters.java   |  2 +-
 .../french_skip_stop_words_before_stemming.txt  |  1 +
 .../sasi/analyzer/StandardAnalyzerTest.java     | 33 +++++++++++++++++++-
 3 files changed, 34 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
index 9e098d1..cb840a8 100644
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
+++ b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
@@ -37,7 +37,7 @@ public class StemmingFilters
 
         public String process(String input) throws Exception
         {
-            if (stemmer == null)
+            if (input == null || stemmer == null)
                 return input;
             stemmer.setCurrent(input);
             return (stemmer.stem()) ? stemmer.getCurrent() : input;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/test/resources/tokenization/french_skip_stop_words_before_stemming.txt
----------------------------------------------------------------------
diff --git a/test/resources/tokenization/french_skip_stop_words_before_stemming.txt b/test/resources/tokenization/french_skip_stop_words_before_stemming.txt
new file mode 100644
index 0000000..59a1c23
--- /dev/null
+++ b/test/resources/tokenization/french_skip_stop_words_before_stemming.txt
@@ -0,0 +1 @@
+"La danse sous la pluie" est une chanson connue
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
index e307512..7a88a3d 100644
--- a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
@@ -25,6 +25,8 @@ import java.util.Locale;
 
 import org.junit.Test;
 
+import org.apache.cassandra.serializers.UTF8Serializer;
+
 import static org.junit.Assert.assertEquals;
 
 public class StandardAnalyzerTest
@@ -151,7 +153,36 @@ public class StandardAnalyzerTest
         while (tokenizer.hasNext())
             tokens.add(tokenizer.next());
 
-        assertEquals(40249, tokens.size());
+        assertEquals(37739, tokens.size());
+    }
+
+    @Test
+    public void testSkipStopWordBeforeStemmingFrench() throws Exception
+    {
+        InputStream is = StandardAnalyzerTest.class.getClassLoader()
+               .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt");
+
+        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
+                .ignoreStopTerms(true).useLocale(Locale.FRENCH)
+                .alwaysLowerCaseTerms(true).build();
+        StandardAnalyzer tokenizer = new StandardAnalyzer();
+        tokenizer.init(options);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        List<String> words = new ArrayList<>();
+        tokenizer.reset(is);
+        while (tokenizer.hasNext())
+        {
+            final ByteBuffer nextToken = tokenizer.next();
+            tokens.add(nextToken);
+            words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate()));
+        }
+
+        assertEquals(4, tokens.size());
+        assertEquals("dans", words.get(0));
+        assertEquals("plui", words.get(1));
+        assertEquals("chanson", words.get(2));
+        assertEquals("connu", words.get(3));
     }
 
     @Test

cassandra git commit: Correct english word stemming test and add a test for french

Reply via email to