(jackrabbit-oak) branch trunk updated: OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord (#1450)

fortino Wed, 15 May 2024 01:40:57 -0700

This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git



The following commit(s) were added to refs/heads/trunk by this push:
     new 054741605c OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord (#1450)
054741605c is described below

commit 054741605c46031878dceaedb74ee45b7e82624c
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Wed May 15 10:40:00 2024 +0200

    OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord (#1450)
    
    * OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord
    
    * OAK-10800: introduce a final step in LUCENE_ELASTIC_TRANSFORMERS to transform the keys from camel case to snake case
    
    * OAK-10800: WordDelimiterFilterFactory transformer
    
    * OAK-10800: improved WordDelimiterFilterFactory transformer
---
 .../index/ElasticCustomAnalyzerMappings.java       | 129 +++++++--------------
 .../plugins/index/FullTextAnalyzerCommonTest.java  |  24 ++++
 2 files changed, 69 insertions(+), 84 deletions(-)

diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
index e8adc6f2e2..cd252226b2 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
@@ -16,23 +16,21 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic.index;
 
+import org.apache.jackrabbit.guava.common.base.CaseFormat;
 import org.apache.lucene.analysis.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
 import org.apache.lucene.analysis.cjk.CJKBigramFilterFactory;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
+import 
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
-import org.apache.lucene.analysis.minhash.MinHashFilterFactory;
-import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.LengthFilterFactory;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
 import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
 import org.apache.lucene.analysis.ngram.NGramFilterFactory;
 import org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
-import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
 import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
 import org.apache.lucene.analysis.util.ElisionFilterFactory;
 import org.jetbrains.annotations.Nullable;
@@ -42,6 +40,8 @@ import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.BiFunction;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
 
 public class ElasticCustomAnalyzerMappings {
 
@@ -112,78 +112,27 @@ public class ElasticCustomAnalyzerMappings {
         LUCENE_ELASTIC_TRANSFORMERS = new LinkedHashMap<>();
 
         LUCENE_ELASTIC_TRANSFORMERS.put(WordDelimiterFilterFactory.class, 
luceneParams -> {
-            if (luceneParams.containsKey("generateWordParts")) {
-                luceneParams.put("generateWordParts", 
Integer.parseInt(luceneParams.get("generateWordParts").toString()) == 1);
-            }
-            if (luceneParams.containsKey("generateNumberParts")) {
-                luceneParams.put("generateNumberParts", 
Integer.parseInt(luceneParams.get("generateNumberParts").toString()) == 1);
-            }
-            if (luceneParams.containsKey("catenateWords")) {
-                luceneParams.put("catenateWords", 
Integer.parseInt(luceneParams.get("catenateWords").toString()) == 1);
-            }
-            if (luceneParams.containsKey("catenateNumbers")) {
-                luceneParams.put("catenateNumbers", 
Integer.parseInt(luceneParams.get("catenateNumbers").toString()) == 1);
-            }
-            if (luceneParams.containsKey("catenateAll")) {
-                luceneParams.put("catenateAll", 
Integer.parseInt(luceneParams.get("catenateAll").toString()) == 1);
-            }
-            if (luceneParams.containsKey("splitOnCaseChange")) {
-                luceneParams.put("splitOnCaseChange", 
Integer.parseInt(luceneParams.get("splitOnCaseChange").toString()) == 1);
-            }
-            if (luceneParams.containsKey("preserveOriginal")) {
-                luceneParams.put("preserveOriginal", 
Integer.parseInt(luceneParams.get("preserveOriginal").toString()) == 1);
-            }
-            if (luceneParams.containsKey("splitOnNumerics")) {
-                luceneParams.put("splitOnNumerics", 
Integer.parseInt(luceneParams.get("splitOnNumerics").toString()) == 1);
-            }
-            if (luceneParams.containsKey("stemEnglishPossessive")) {
-                luceneParams.put("stemEnglishPossessive", 
Integer.parseInt(luceneParams.get("stemEnglishPossessive").toString()) == 1);
-            }
+            Consumer<String> transformFlag = flag -> 
luceneParams.computeIfPresent(flag, (k, v) -> Integer.parseInt(v.toString()) == 
1);
+
+            transformFlag.accept("generateWordParts");
+            transformFlag.accept("generateNumberParts");
+            transformFlag.accept("catenateWords");
+            transformFlag.accept("catenateNumbers");
+            transformFlag.accept("catenateAll");
+            transformFlag.accept("splitOnCaseChange");
+            transformFlag.accept("preserveOriginal");
+            transformFlag.accept("splitOnNumerics");
+            transformFlag.accept("stemEnglishPossessive");
+
             return reKey.apply(luceneParams, Map.of(
-                    "generateWordParts", "generate_word_parts",
-                    "generateNumberParts", "generate_number_parts",
-                    "catenateWords", "catenate_words",
-                    "catenateNumbers", "catenate_numbers",
-                    "catenateAll", "catenate_all",
-                    "splitOnCaseChange", "split_on_case_change",
-                    "preserveOriginal", "preserve_original",
-                    "splitOnNumerics", "split_on_numerics",
-                    "stemEnglishPossessive", "stem_english_possessive",
                     "protectedTokens", "protected_words"
             ));
         });
 
-        LUCENE_ELASTIC_TRANSFORMERS.put(ShingleFilterFactory.class, 
luceneParams ->
-                reKey.apply(luceneParams, Map.of(
-                        "minShingleSize", "min_shingle_size",
-                        "maxShingleSize", "max_shingle_size",
-                        "outputUnigrams", "output_unigrams",
-                        "outputUnigramsIfNoShingles", 
"output_unigrams_if_no_shingles",
-                        "tokenSeparator", "token_separator",
-                        "fillerToken", "filler_token"
-                ))
-        );
-
         
LUCENE_ELASTIC_TRANSFORMERS.put(PatternCaptureGroupFilterFactory.class, 
luceneParams ->
                 reKey.apply(luceneParams, Map.of("pattern", "patterns"))
         );
 
-        LUCENE_ELASTIC_TRANSFORMERS.put(MinHashFilterFactory.class, 
luceneParams ->
-                reKey.apply(luceneParams, Map.of(
-                        "hashCount", "hash_count",
-                        "bucketCount", "bucket_count",
-                        "hashSetSize", "hash_set_size",
-                        "withRotation", "with_rotation"
-                ))
-        );
-
-        LUCENE_ELASTIC_TRANSFORMERS.put(LimitTokenCountFilterFactory.class, 
luceneParams ->
-                reKey.apply(luceneParams, Map.of(
-                        "maxTokenCount", "max_token_count",
-                        "consumeAllTokens", "consume_all_tokens"
-                ))
-        );
-
         LUCENE_ELASTIC_TRANSFORMERS.put(KeepWordFilterFactory.class, 
luceneParams ->
                 reKey.apply(luceneParams, Map.of(
                         "words", "keep_words",
@@ -230,10 +179,6 @@ public class ElasticCustomAnalyzerMappings {
                 reKey.apply(luceneParams, Map.of("protected", "keywords"))
         );
 
-        LUCENE_ELASTIC_TRANSFORMERS.put(ASCIIFoldingFilterFactory.class, 
luceneParams ->
-                reKey.apply(luceneParams, Map.of("preserveOriginal", 
"preserve_original"))
-        );
-
         LUCENE_ELASTIC_TRANSFORMERS.put(CJKBigramFilterFactory.class, 
luceneParams -> {
             List<String> ignored = new ArrayList<>();
             if (!Boolean.parseBoolean(luceneParams.getOrDefault("hal", 
true).toString())) {
@@ -251,28 +196,44 @@ public class ElasticCustomAnalyzerMappings {
             if (!ignored.isEmpty()) {
                 luceneParams.put("ignored_scripts", ignored);
             }
-            return reKey.apply(luceneParams, Map.of("outputUnigrams", 
"output_unigrams"));
+            return luceneParams;
         });
 
         LUCENE_ELASTIC_TRANSFORMERS.put(AbstractWordsFileFilterFactory.class, 
luceneParams -> {
             luceneParams.remove("enablePositionIncrements");
-            return reKey.apply(luceneParams, Map.of("words", "stopwords", 
"ignoreCase", "ignore_case"));
+            return reKey.apply(luceneParams, Map.of("words", "stopwords"));
         });
+
+        
LUCENE_ELASTIC_TRANSFORMERS.put(DictionaryCompoundWordTokenFilterFactory.class, 
luceneParams -> reKey.apply(luceneParams, Map.of(
+                "dictionary", "word_list"
+        )));
+
+        // default transformer executed as final step on all the filters to 
transform the keys from camel case to snake case
+        LUCENE_ELASTIC_TRANSFORMERS.put(AbstractAnalysisFactory.class, 
luceneParams ->
+                luceneParams.entrySet().stream()
+                        .collect(Collectors.toMap(
+                                entry -> 
CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, entry.getKey()),
+                                Map.Entry::getValue, // keep the original value
+                                (oldValue, newValue) -> oldValue, // in case 
of duplicate keys, keep the old value
+                                LinkedHashMap::new // preserve the original 
order
+                        ))
+        );
     }
 
     /*
      * Some filter names cannot be transformed from the original name. Here we 
map the exceptions
      */
-    protected static final Map<String, String> FILTERS = Map.of(
-            "porter_stem", "porter2",
-            "ascii_folding", "asciifolding",
-            "n_gram", "ngram",
-            "edge_n_gram", "edge_ngram",
-            "keep_word", "keep",
-            "k_stem", "kstem",
-            "limit_token_count", "limit",
-            "pattern_capture_group", "pattern_capture",
-            "reverse_string", "reverse",
-            "snowball_porter", "snowball"
+    protected static final Map<String, String> FILTERS = Map.ofEntries(
+            Map.entry("porter_stem", "porter2"),
+            Map.entry("ascii_folding", "asciifolding"),
+            Map.entry("n_gram", "ngram"),
+            Map.entry("edge_n_gram", "edge_ngram"),
+            Map.entry("keep_word", "keep"),
+            Map.entry("k_stem", "kstem"),
+            Map.entry("limit_token_count", "limit"),
+            Map.entry("pattern_capture_group", "pattern_capture"),
+            Map.entry("reverse_string", "reverse"),
+            Map.entry("snowball_porter", "snowball"),
+            Map.entry("dictionary_compound_word", "dictionary_decompounder")
     );
 }
diff --git 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
index 8c2910d015..ea8450e0cf 100644
--- 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
+++ 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
@@ -760,6 +760,30 @@ public abstract class FullTextAnalyzerCommonTest extends 
AbstractQueryTest {
         assertEventually(() -> assertQuery("select * from [nt:base] where 
CONTAINS(*, 'quick brown')", List.of("/content/bar")));
     }
 
+    @Test
+    public void fulltextSearchWithDictionaryCompounderFilter() throws 
Exception {
+        setup(List.of("foo"), idx -> {
+            Tree anl = 
idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
+            
anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME,
 "Standard");
+
+            Tree filters = anl.addChild(FulltextIndexConstants.ANL_FILTERS);
+            Tree dd = addFilter(filters, "DictionaryCompoundWord");
+            dd.setProperty("dictionary", "words.txt");
+            dd.addChild("words.txt").addChild(JcrConstants.JCR_CONTENT)
+                    .setProperty(JcrConstants.JCR_DATA, 
"Donau\ndampf\nmeer\nschiff");
+        });
+
+        Tree content = root.getTree("/").addChild("content");
+        content.addChild("bar").setProperty("foo", "Donaudampfschiff");
+        content.addChild("baz").setProperty("foo", "some other content");
+        root.commit();
+
+        assertEventually(() -> {
+            assertQuery("select * from [nt:base] where CONTAINS(*, 'dampf')", 
List.of("/content/bar"));
+            assertQuery("select * from [nt:base] where CONTAINS(*, 'damp')", 
List.of());
+        });
+    }
+
     //OAK-4805
     @Test
     public void badIndexDefinitionShouldLetQEWork() throws Exception {

(jackrabbit-oak) branch trunk updated: OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord (#1450)

Reply via email to