Repository: cassandra Updated Branches: refs/heads/trunk f1cabcade -> eb82861c8
Correct English word stemming test and add a test for French. Patch by doanduyhai; reviewed by xedin for CASSANDRA-12078 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/eb82861c Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/eb82861c Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/eb82861c Branch: refs/heads/trunk Commit: eb82861c8d4c497d64b5e61a1606bdd270e8e109 Parents: f1cabca Author: Pavel Yaskevich <[email protected]> Authored: Sun Jun 26 01:48:23 2016 -0700 Committer: Pavel Yaskevich <[email protected]> Committed: Sun Jun 26 01:50:32 2016 -0700 ---------------------------------------------------------------------- .../sasi/analyzer/filter/StemmingFilters.java | 2 +- .../french_skip_stop_words_before_stemming.txt | 1 + .../sasi/analyzer/StandardAnalyzerTest.java | 33 +++++++++++++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java index 9e098d1..cb840a8 100644 --- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java +++ b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java @@ -37,7 +37,7 @@ public class StemmingFilters public String process(String input) throws Exception { - if (stemmer == null) + if (input == null || stemmer == null) return input; stemmer.setCurrent(input); return (stemmer.stem()) ? 
stemmer.getCurrent() : input; http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/test/resources/tokenization/french_skip_stop_words_before_stemming.txt ---------------------------------------------------------------------- diff --git a/test/resources/tokenization/french_skip_stop_words_before_stemming.txt b/test/resources/tokenization/french_skip_stop_words_before_stemming.txt new file mode 100644 index 0000000..59a1c23 --- /dev/null +++ b/test/resources/tokenization/french_skip_stop_words_before_stemming.txt @@ -0,0 +1 @@ +"La danse sous la pluie" est une chanson connue \ No newline at end of file http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java index e307512..7a88a3d 100644 --- a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java @@ -25,6 +25,8 @@ import java.util.Locale; import org.junit.Test; +import org.apache.cassandra.serializers.UTF8Serializer; + import static org.junit.Assert.assertEquals; public class StandardAnalyzerTest @@ -151,7 +153,36 @@ public class StandardAnalyzerTest while (tokenizer.hasNext()) tokens.add(tokenizer.next()); - assertEquals(40249, tokens.size()); + assertEquals(37739, tokens.size()); + } + + @Test + public void testSkipStopWordBeforeStemmingFrench() throws Exception + { + InputStream is = StandardAnalyzerTest.class.getClassLoader() + .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt"); + + StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true) + .ignoreStopTerms(true).useLocale(Locale.FRENCH) + 
.alwaysLowerCaseTerms(true).build(); + StandardAnalyzer tokenizer = new StandardAnalyzer(); + tokenizer.init(options); + + List<ByteBuffer> tokens = new ArrayList<>(); + List<String> words = new ArrayList<>(); + tokenizer.reset(is); + while (tokenizer.hasNext()) + { + final ByteBuffer nextToken = tokenizer.next(); + tokens.add(nextToken); + words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate())); + } + + assertEquals(4, tokens.size()); + assertEquals("dans", words.get(0)); + assertEquals("plui", words.get(1)); + assertEquals("chanson", words.get(2)); + assertEquals("connu", words.get(3)); } @Test
