This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 054741605c OAK-10800: (oak-search-elastic) add mapping for
DictionaryCompoundWord (#1450)
054741605c is described below
commit 054741605c46031878dceaedb74ee45b7e82624c
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Wed May 15 10:40:00 2024 +0200
OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord
(#1450)
* OAK-10800: (oak-search-elastic) add mapping for DictionaryCompoundWord
* OAK-10800: introduce a final step in LUCENE_ELASTIC_TRANSFORMERS to
transform the keys from camel case to snake case
* OAK-10800: WordDelimiterFilterFactory transformer
* OAK-10800: improved WordDelimiterFilterFactory transformer
---
.../index/ElasticCustomAnalyzerMappings.java | 129 +++++++--------------
.../plugins/index/FullTextAnalyzerCommonTest.java | 24 ++++
2 files changed, 69 insertions(+), 84 deletions(-)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
index e8adc6f2e2..cd252226b2 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
@@ -16,23 +16,21 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.index;
+import org.apache.jackrabbit.guava.common.base.CaseFormat;
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.cjk.CJKBigramFilterFactory;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
+import
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
-import org.apache.lucene.analysis.minhash.MinHashFilterFactory;
-import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory;
import org.apache.lucene.analysis.miscellaneous.LengthFilterFactory;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
-import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ElisionFilterFactory;
import org.jetbrains.annotations.Nullable;
@@ -42,6 +40,8 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
public class ElasticCustomAnalyzerMappings {
@@ -112,78 +112,27 @@ public class ElasticCustomAnalyzerMappings {
LUCENE_ELASTIC_TRANSFORMERS = new LinkedHashMap<>();
LUCENE_ELASTIC_TRANSFORMERS.put(WordDelimiterFilterFactory.class,
luceneParams -> {
- if (luceneParams.containsKey("generateWordParts")) {
- luceneParams.put("generateWordParts",
Integer.parseInt(luceneParams.get("generateWordParts").toString()) == 1);
- }
- if (luceneParams.containsKey("generateNumberParts")) {
- luceneParams.put("generateNumberParts",
Integer.parseInt(luceneParams.get("generateNumberParts").toString()) == 1);
- }
- if (luceneParams.containsKey("catenateWords")) {
- luceneParams.put("catenateWords",
Integer.parseInt(luceneParams.get("catenateWords").toString()) == 1);
- }
- if (luceneParams.containsKey("catenateNumbers")) {
- luceneParams.put("catenateNumbers",
Integer.parseInt(luceneParams.get("catenateNumbers").toString()) == 1);
- }
- if (luceneParams.containsKey("catenateAll")) {
- luceneParams.put("catenateAll",
Integer.parseInt(luceneParams.get("catenateAll").toString()) == 1);
- }
- if (luceneParams.containsKey("splitOnCaseChange")) {
- luceneParams.put("splitOnCaseChange",
Integer.parseInt(luceneParams.get("splitOnCaseChange").toString()) == 1);
- }
- if (luceneParams.containsKey("preserveOriginal")) {
- luceneParams.put("preserveOriginal",
Integer.parseInt(luceneParams.get("preserveOriginal").toString()) == 1);
- }
- if (luceneParams.containsKey("splitOnNumerics")) {
- luceneParams.put("splitOnNumerics",
Integer.parseInt(luceneParams.get("splitOnNumerics").toString()) == 1);
- }
- if (luceneParams.containsKey("stemEnglishPossessive")) {
- luceneParams.put("stemEnglishPossessive",
Integer.parseInt(luceneParams.get("stemEnglishPossessive").toString()) == 1);
- }
+ Consumer<String> transformFlag = flag ->
luceneParams.computeIfPresent(flag, (k, v) -> Integer.parseInt(v.toString()) ==
1);
+
+ transformFlag.accept("generateWordParts");
+ transformFlag.accept("generateNumberParts");
+ transformFlag.accept("catenateWords");
+ transformFlag.accept("catenateNumbers");
+ transformFlag.accept("catenateAll");
+ transformFlag.accept("splitOnCaseChange");
+ transformFlag.accept("preserveOriginal");
+ transformFlag.accept("splitOnNumerics");
+ transformFlag.accept("stemEnglishPossessive");
+
return reKey.apply(luceneParams, Map.of(
- "generateWordParts", "generate_word_parts",
- "generateNumberParts", "generate_number_parts",
- "catenateWords", "catenate_words",
- "catenateNumbers", "catenate_numbers",
- "catenateAll", "catenate_all",
- "splitOnCaseChange", "split_on_case_change",
- "preserveOriginal", "preserve_original",
- "splitOnNumerics", "split_on_numerics",
- "stemEnglishPossessive", "stem_english_possessive",
"protectedTokens", "protected_words"
));
});
- LUCENE_ELASTIC_TRANSFORMERS.put(ShingleFilterFactory.class,
luceneParams ->
- reKey.apply(luceneParams, Map.of(
- "minShingleSize", "min_shingle_size",
- "maxShingleSize", "max_shingle_size",
- "outputUnigrams", "output_unigrams",
- "outputUnigramsIfNoShingles",
"output_unigrams_if_no_shingles",
- "tokenSeparator", "token_separator",
- "fillerToken", "filler_token"
- ))
- );
-
LUCENE_ELASTIC_TRANSFORMERS.put(PatternCaptureGroupFilterFactory.class,
luceneParams ->
reKey.apply(luceneParams, Map.of("pattern", "patterns"))
);
- LUCENE_ELASTIC_TRANSFORMERS.put(MinHashFilterFactory.class,
luceneParams ->
- reKey.apply(luceneParams, Map.of(
- "hashCount", "hash_count",
- "bucketCount", "bucket_count",
- "hashSetSize", "hash_set_size",
- "withRotation", "with_rotation"
- ))
- );
-
- LUCENE_ELASTIC_TRANSFORMERS.put(LimitTokenCountFilterFactory.class,
luceneParams ->
- reKey.apply(luceneParams, Map.of(
- "maxTokenCount", "max_token_count",
- "consumeAllTokens", "consume_all_tokens"
- ))
- );
-
LUCENE_ELASTIC_TRANSFORMERS.put(KeepWordFilterFactory.class,
luceneParams ->
reKey.apply(luceneParams, Map.of(
"words", "keep_words",
@@ -230,10 +179,6 @@ public class ElasticCustomAnalyzerMappings {
reKey.apply(luceneParams, Map.of("protected", "keywords"))
);
- LUCENE_ELASTIC_TRANSFORMERS.put(ASCIIFoldingFilterFactory.class,
luceneParams ->
- reKey.apply(luceneParams, Map.of("preserveOriginal",
"preserve_original"))
- );
-
LUCENE_ELASTIC_TRANSFORMERS.put(CJKBigramFilterFactory.class,
luceneParams -> {
List<String> ignored = new ArrayList<>();
if (!Boolean.parseBoolean(luceneParams.getOrDefault("hal",
true).toString())) {
@@ -251,28 +196,44 @@ public class ElasticCustomAnalyzerMappings {
if (!ignored.isEmpty()) {
luceneParams.put("ignored_scripts", ignored);
}
- return reKey.apply(luceneParams, Map.of("outputUnigrams",
"output_unigrams"));
+ return luceneParams;
});
LUCENE_ELASTIC_TRANSFORMERS.put(AbstractWordsFileFilterFactory.class,
luceneParams -> {
luceneParams.remove("enablePositionIncrements");
- return reKey.apply(luceneParams, Map.of("words", "stopwords",
"ignoreCase", "ignore_case"));
+ return reKey.apply(luceneParams, Map.of("words", "stopwords"));
});
+
+
LUCENE_ELASTIC_TRANSFORMERS.put(DictionaryCompoundWordTokenFilterFactory.class,
luceneParams -> reKey.apply(luceneParams, Map.of(
+ "dictionary", "word_list"
+ )));
+
+ // default transformer executed as final step on all the filters to transform the keys from camel case to snake case
+ LUCENE_ELASTIC_TRANSFORMERS.put(AbstractAnalysisFactory.class,
luceneParams ->
+ luceneParams.entrySet().stream()
+ .collect(Collectors.toMap(
+ entry ->
CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, entry.getKey()),
+ Map.Entry::getValue, // keep the original value
+ (oldValue, newValue) -> oldValue, // in case
of duplicate keys, keep the old value
+ LinkedHashMap::new // preserve the original
order
+ ))
+ );
}
/*
* Some filter names cannot be transformed from the original name. Here we map the exceptions
*/
- protected static final Map<String, String> FILTERS = Map.of(
- "porter_stem", "porter2",
- "ascii_folding", "asciifolding",
- "n_gram", "ngram",
- "edge_n_gram", "edge_ngram",
- "keep_word", "keep",
- "k_stem", "kstem",
- "limit_token_count", "limit",
- "pattern_capture_group", "pattern_capture",
- "reverse_string", "reverse",
- "snowball_porter", "snowball"
+ protected static final Map<String, String> FILTERS = Map.ofEntries(
+ Map.entry("porter_stem", "porter2"),
+ Map.entry("ascii_folding", "asciifolding"),
+ Map.entry("n_gram", "ngram"),
+ Map.entry("edge_n_gram", "edge_ngram"),
+ Map.entry("keep_word", "keep"),
+ Map.entry("k_stem", "kstem"),
+ Map.entry("limit_token_count", "limit"),
+ Map.entry("pattern_capture_group", "pattern_capture"),
+ Map.entry("reverse_string", "reverse"),
+ Map.entry("snowball_porter", "snowball"),
+ Map.entry("dictionary_compound_word", "dictionary_decompounder")
);
}
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
index 8c2910d015..ea8450e0cf 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
@@ -760,6 +760,30 @@ public abstract class FullTextAnalyzerCommonTest extends
AbstractQueryTest {
assertEventually(() -> assertQuery("select * from [nt:base] where
CONTAINS(*, 'quick brown')", List.of("/content/bar")));
}
+ @Test
+ public void fulltextSearchWithDictionaryCompounderFilter() throws
Exception {
+ setup(List.of("foo"), idx -> {
+ Tree anl =
idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
+
anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME,
"Standard");
+
+ Tree filters = anl.addChild(FulltextIndexConstants.ANL_FILTERS);
+ Tree dd = addFilter(filters, "DictionaryCompoundWord");
+ dd.setProperty("dictionary", "words.txt");
+ dd.addChild("words.txt").addChild(JcrConstants.JCR_CONTENT)
+ .setProperty(JcrConstants.JCR_DATA,
"Donau\ndampf\nmeer\nschiff");
+ });
+
+ Tree content = root.getTree("/").addChild("content");
+ content.addChild("bar").setProperty("foo", "Donaudampfschiff");
+ content.addChild("baz").setProperty("foo", "some other content");
+ root.commit();
+
+ assertEventually(() -> {
+ assertQuery("select * from [nt:base] where CONTAINS(*, 'dampf')",
List.of("/content/bar"));
+ assertQuery("select * from [nt:base] where CONTAINS(*, 'damp')",
List.of());
+ });
+ }
+
//OAK-4805
@Test
public void badIndexDefinitionShouldLetQEWork() throws Exception {