This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new e50de3a317 OAK-10226: fix lucene->elastic conversion for WordDelimiter
(#1003)
e50de3a317 is described below
commit e50de3a31756d1884bf15aa9485379b4538c253c
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Wed Jun 28 09:14:22 2023 +0200
OAK-10226: fix lucene->elastic conversion for WordDelimiter (#1003)
* OAK-10226: fix lucene->elastic conversion for WordDelimiter
* OAK-10226: missing rekey for protected_words
* OAK-10226: cleanup unused import
* OAK-10226: word_delimiter should rekey in the last step to allow elastic
native config
* OAK-10226: incorporate changes from @nasokan
---
.../index/ElasticCustomAnalyzerMappings.java | 43 ++++++++++++++++++++++
.../plugins/index/FullTextAnalyzerCommonTest.java | 37 +++++++++++++++++++
2 files changed, 80 insertions(+)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
index 006260660e..ea79c50086 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzerMappings.java
@@ -26,6 +26,7 @@ import
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory;
import org.apache.lucene.analysis.miscellaneous.LengthFilterFactory;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory;
@@ -110,6 +111,48 @@ public class ElasticCustomAnalyzerMappings {
LUCENE_ELASTIC_TRANSFORMERS = new LinkedHashMap<>();
+ LUCENE_ELASTIC_TRANSFORMERS.put(WordDelimiterFilterFactory.class,
luceneParams -> {
+ if (luceneParams.containsKey("generateWordParts")) {
+ luceneParams.put("generateWordParts",
Integer.parseInt(luceneParams.get("generateWordParts").toString()) == 1);
+ }
+ if (luceneParams.containsKey("generateNumberParts")) {
+ luceneParams.put("generateNumberParts",
Integer.parseInt(luceneParams.get("generateNumberParts").toString()) == 1);
+ }
+ if (luceneParams.containsKey("catenateWords")) {
+ luceneParams.put("catenateWords",
Integer.parseInt(luceneParams.get("catenateWords").toString()) == 1);
+ }
+ if (luceneParams.containsKey("catenateNumbers")) {
+ luceneParams.put("catenateNumbers",
Integer.parseInt(luceneParams.get("catenateNumbers").toString()) == 1);
+ }
+ if (luceneParams.containsKey("catenateAll")) {
+ luceneParams.put("catenateAll",
Integer.parseInt(luceneParams.get("catenateAll").toString()) == 1);
+ }
+ if (luceneParams.containsKey("splitOnCaseChange")) {
+ luceneParams.put("splitOnCaseChange",
Integer.parseInt(luceneParams.get("splitOnCaseChange").toString()) == 1);
+ }
+ if (luceneParams.containsKey("preserveOriginal")) {
+ luceneParams.put("preserveOriginal",
Integer.parseInt(luceneParams.get("preserveOriginal").toString()) == 1);
+ }
+ if (luceneParams.containsKey("splitOnNumerics")) {
+ luceneParams.put("splitOnNumerics",
Integer.parseInt(luceneParams.get("splitOnNumerics").toString()) == 1);
+ }
+ if (luceneParams.containsKey("stemEnglishPossessive")) {
+ luceneParams.put("stemEnglishPossessive",
Integer.parseInt(luceneParams.get("stemEnglishPossessive").toString()) == 1);
+ }
+ return reKey.apply(luceneParams, Map.of(
+ "generateWordParts", "generate_word_parts",
+ "generateNumberParts", "generate_number_parts",
+ "catenateWords", "catenate_words",
+ "catenateNumbers", "catenate_numbers",
+ "catenateAll", "catenate_all",
+ "splitOnCaseChange", "split_on_case_change",
+ "preserveOriginal", "preserve_original",
+ "splitOnNumerics", "split_on_numerics",
+ "stemEnglishPossessive", "stem_english_possessive",
+ "protectedTokens", "protected_words"
+ ));
+ });
+
LUCENE_ELASTIC_TRANSFORMERS.put(ShingleFilterFactory.class,
luceneParams ->
reKey.apply(luceneParams, Map.of(
"minShingleSize", "min_shingle_size",
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
index 546b3caa8a..19060dd7f3 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
@@ -541,6 +541,43 @@ public abstract class FullTextAnalyzerCommonTest extends
AbstractQueryTest {
assertEventually(() -> assertQuery("select * from [nt:base] where
CONTAINS(*, 'brown')", List.of("/bar")));
}
+ @Test
+ public void fulltextSearchWithStemmingAndAsciiFilter() throws Exception {
+     // WordDelimiter options, given as Lucene-style "0"/"1" strings, in the order they are set.
+     final String[][] wordDelimiterFlags = {
+         {"generateWordParts", "1"},
+         {"stemEnglishPossessive", "1"},
+         {"generateNumberParts", "1"},
+         {"preserveOriginal", "0"},
+         {"splitOnCaseChange", "0"},
+         {"splitOnNumerics", "0"},
+         {"catenateWords", "0"},
+         {"catenateNumbers", "0"},
+         {"catenateAll", "0"}
+     };
+
+     // Analyzer chain: Standard tokenizer, then LowerCase, ASCIIFolding, WordDelimiter, PorterStem.
+     setup(List.of("foo"), idx -> {
+         Tree analyzer = idx.addChild(FulltextIndexConstants.ANALYZERS)
+                 .addChild(FulltextIndexConstants.ANL_DEFAULT);
+         Tree tokenizer = analyzer.addChild(FulltextIndexConstants.ANL_TOKENIZER);
+         tokenizer.setProperty(FulltextIndexConstants.ANL_NAME, "Standard");
+
+         Tree filterChain = analyzer.addChild(FulltextIndexConstants.ANL_FILTERS);
+         filterChain.addChild("LowerCase");
+         filterChain.addChild("ASCIIFolding");
+         Tree delimiter = filterChain.addChild("WordDelimiter");
+         for (String[] flag : wordDelimiterFlags) {
+             delimiter.setProperty(flag[0], flag[1]);
+         }
+         filterChain.addChild("PorterStem");
+     });
+
+     Tree content = root.getTree("/");
+     content.addChild("bar").setProperty("foo", "quick");
+     content.addChild("baz").setProperty("foo", "quick brown foxes");
+     // non-ASCII spelling; the query below looks it up by its ASCII form "masse"
+     content.addChild("bat").setProperty("foo", "maße");
+     root.commit();
+
+     assertEventually(() -> {
+         // exact term present in two nodes
+         assertQuery("select * from [nt:base] where CONTAINS(*, 'quick')",
+                 List.of("/bar", "/baz"));
+         // plural and singular forms are both expected to hit the same node
+         assertQuery("select * from [nt:base] where CONTAINS(*, 'foxes')",
+                 List.of("/baz"));
+         assertQuery("select * from [nt:base] where CONTAINS(*, 'fox')",
+                 List.of("/baz"));
+         // ASCII spelling of the stored non-ASCII term
+         assertQuery("select * from [nt:base] where CONTAINS(*, 'masse')",
+                 List.of("/bat"));
+     });
+ }
+
@Test
public void fulltextSearchWithNGram() throws Exception {
setup(List.of("foo"), idx -> {