This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 53c196460c OAK-11603: lucene 4.x fuzzy queries don't work in Elastic
(#2180)
53c196460c is described below
commit 53c196460c03383e743a727ddb4db4234161aaaf
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Fri Mar 14 17:26:26 2025 +0100
OAK-11603: lucene 4.x fuzzy queries don't work in Elastic (#2180)
* OAK-11603: lucene 4.x fuzzy queries don't work in Elastic
* OAK-11603: improve fuzzy conversion
---
.../index/elastic/query/ElasticRequestHandler.java | 80 +++++++++++++++++++++-
.../index/elastic/ElasticFullTextIndexTest.java | 30 ++++++++
.../oak/plugins/index/FullTextIndexCommonTest.java | 34 ++++++++-
3 files changed, 139 insertions(+), 5 deletions(-)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 6b9d71eec2..713d7d31f5 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -104,6 +104,8 @@ import java.util.function.BiConsumer;
import java.util.function.BiPredicate;
import java.util.function.Consumer;
import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
@@ -126,6 +128,11 @@ public class ElasticRequestHandler {
private static final String HIGHLIGHT_PREFIX = "<strong>";
private static final String HIGHLIGHT_SUFFIX = "</strong>";
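+ // Matches any <term>~<number> fuzzy suffix: Lucene 4.x float similarities (e.g., roam~0.8)
+ // as well as integer edit distances (e.g., roam~2). Integers in the 0-2 range are already
+ // valid for Elastic and are filtered out afterwards with ELASTIC_FUZZY_PATTERN.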
+ private static final Pattern LUCENE_4_FUZZY_PATTERN =
Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
+ // From Lucene 5 and above (used by elastic), the fuzzy query syntax has
changed to use a single integer
+ private static final Pattern ELASTIC_FUZZY_PATTERN =
Pattern.compile("\\b(\\w+)~([0-2])\\b");
+
private final IndexPlan indexPlan;
private final Filter filter;
private final PlanResult planResult;
@@ -889,10 +896,10 @@ public class ElasticRequestHandler {
return Query.of(q -> q.multiMatch(m -> m.fields(uuid)));
}
- private static QueryStringQuery.Builder fullTextQuery(String text, String
fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
+ private QueryStringQuery.Builder fullTextQuery(String text, String
fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
LOG.debug("fullTextQuery for text: '{}', fieldName: '{}'", text,
fieldName);
QueryStringQuery.Builder qsqBuilder = new QueryStringQuery.Builder()
- .query(FulltextIndex.rewriteQueryText(text))
+ .query(rewriteQueryText(text))
.defaultOperator(Operator.And)
.type(TextQueryType.CrossFields)
.tieBreaker(0.5d);
@@ -908,6 +915,75 @@ public class ElasticRequestHandler {
return qsqBuilder.fields(fieldName);
}
+    /**
+     * Prepares full-text query text for Elastic.
+     *
+     * @param text the raw full-text expression
+     * @return the text after the common {@code FulltextIndex} rewrite, with fuzzy operators converted
+     */
+    private String rewriteQueryText(String text) {
+        // Apply the shared rewrite first, then handle the special cases where the syntax used in the
+        // lucene 4.x query parser is not supported by the current version (fuzzy operators so far).
+        return convertFuzzyQuery(FulltextIndex.rewriteQueryText(text));
+    }
+
+    /**
+     * Converts Lucene fuzzy queries from the old syntax (float similarity) to the new syntax (edit distance).
+     * <p>
+     * In Lucene 4, fuzzy queries were specified using a floating-point similarity (e.g., "term~0.8"), where values
+     * closer to 1 required a higher similarity match. In later Lucene versions, this was replaced with a discrete
+     * edit distance (0, 1, or 2).
+     * <p>
+     * Conversion rules applied to every {@code term~number} occurrence:
+     * <ul>
+     *     <li>An integer 0, 1 or 2 is already valid and is left untouched (e.g., "test~2").</li>
+     *     <li>Any other plain integer is an edit distance outside the supported range and is clamped to 2
+     *     (e.g., "roam~3" → "roam~2").</li>
+     *     <li>A decimal value is a similarity: {@code >= 0.8} → 0, {@code >= 0.5} → 1, otherwise 2
+     *     (e.g., "roam~0.7" → "roam~1").</li>
+     *     <li>Proximity queries (e.g., "\"quick fox\"~5") are not matched by the pattern and remain unchanged.</li>
+     * </ul>
+     *
+     * @param text The input query string containing fuzzy or proximity queries.
+     * @return A query string where old fuzzy syntax is converted to the new format; the very same
+     * {@code text} when nothing needed conversion.
+     */
+    private String convertFuzzyQuery(String text) {
+        // Cheap pre-check: without a tilde there cannot be any fuzzy operator
+        if (!text.contains("~")) {
+            return text;
+        }
+
+        Matcher fuzzyMatcher = LUCENE_4_FUZZY_PATTERN.matcher(text);
+        StringBuilder result = new StringBuilder();
+        boolean converted = false;
+
+        while (fuzzyMatcher.find()) {
+            String term = fuzzyMatcher.group(1);
+            String fuzzyValue = fuzzyMatcher.group(2);
+
+            // Already using the new syntax (integer 0-2): nothing to replace. The skipped text is copied
+            // verbatim by the next appendReplacement/appendTail call.
+            if (ELASTIC_FUZZY_PATTERN.matcher(term + "~" + fuzzyValue).matches()) {
+                continue;
+            }
+
+            int editDistance = 2; // Default to the most lenient setting
+            try {
+                float value = Float.parseFloat(fuzzyValue);
+                if (fuzzyValue.indexOf('.') < 0) {
+                    // Plain integer: this is an edit distance, not a similarity. Clamp it to the maximum
+                    // of 2 instead of treating it as a similarity >= 0.8 (which would force an exact match).
+                    editDistance = (int) Math.min(value, 2f);
+                } else if (value >= 0.8f) {
+                    editDistance = 0;
+                } else if (value >= 0.5f) {
+                    editDistance = 1;
+                }
+            } catch (NumberFormatException e) {
+                // Defensive only: the pattern is expected to capture parseable numbers
+                LOG.warn("Invalid fuzzy value: {} for query text {}, using default edit distance of 2", fuzzyValue, text);
+            }
+
+            fuzzyMatcher.appendReplacement(result, term + "~" + editDistance);
+            converted = true;
+        }
+
+        if (!converted) {
+            // Either the tilde is not part of a fuzzy operator or every operator was already valid:
+            // return the input untouched and do not report a conversion that never happened
+            return text;
+        }
+
+        fuzzyMatcher.appendTail(result);
+        String resultString = result.toString();
+        LOG.info("Converted fuzzy query from '{}' to '{}'", text, resultString);
+        return resultString;
+    }
+
private Query createQuery(String propertyName, Filter.PropertyRestriction
pr, PropertyDefinition defn) {
final String field =
elasticIndexDefinition.getElasticKeyword(propertyName);
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
index 37885b7502..457add8e22 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
@@ -17,8 +17,15 @@
package org.apache.jackrabbit.oak.plugins.index.elastic;
import org.apache.jackrabbit.oak.api.ContentRepository;
+import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.plugins.index.FullTextIndexCommonTest;
import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.MatcherAssert.assertThat;
public class ElasticFullTextIndexTest extends FullTextIndexCommonTest {
@@ -40,4 +47,27 @@ public class ElasticFullTextIndexTest extends
FullTextIndexCommonTest {
setTraversalEnabled(false);
}
+    /**
+     * Elastic-only check: a Lucene 4.x similarity ({@code wordl~0.5}) and an edit distance
+     * ({@code sample~1}) can be used on their own and combined in the same full-text expression.
+     */
+    @Test
+    public void fullTextWithFuzzyEditDistance() throws Exception {
+        Tree index = setup(b -> b.indexRule("nt:base").property("propa").analyzed(), idx -> { }, "propa");
+
+        // two nodes, each reachable only through a fuzzy match of the terms used below
+        Tree testRoot = root.getTree("/").addChild("test");
+        testRoot.addChild("a").setProperty("propa", "Hello World!");
+        testRoot.addChild("b").setProperty("propa", "Simple test");
+        root.commit();
+
+        String similarityOnly = "//*[jcr:contains(@propa, 'wordl~0.5')]";
+        String similarityAndDistance = "//*[jcr:contains(@propa, 'wordl~0.5 OR sample~1')]";
+
+        assertEventually(() -> {
+            // the query must be served by the index created above
+            String expectedPlan = indexOptions.getIndexType() + ":" + index.getName();
+            assertThat(explain(similarityOnly, XPATH), containsString(expectedPlan));
+
+            assertQuery(similarityOnly, XPATH, List.of("/test/a"));
+            assertQuery(similarityAndDistance, XPATH, List.of("/test/a", "/test/b"));
+        });
+    }
+
+
}
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
index abc2a6b210..859acaf435 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
@@ -75,7 +75,8 @@ public abstract class FullTextIndexCommonTest extends
AbstractQueryTest {
test.addChild("a").setProperty("propa", "Hello everyone. This is a
fulltext test");
root.commit();
- // fuzziness support the following syntax: <term>~[edit_distance] (eg:
hello~2). The query below is invalid
+ // fuzziness supports the following syntax: <term>~[edit_distance or similarity value]
+ // (eg: hello~2 or hello~0.8). The query below is invalid
+ //
https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches
//
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-fuzziness
String query = "//*[jcr:contains(@propa, 'hello e~one')]";
@@ -85,6 +86,33 @@ public abstract class FullTextIndexCommonTest extends
AbstractQueryTest {
});
}
+    /**
+     * Fuzzy operators expressed as a similarity value ({@code term~0.5}) match misspelled terms,
+     * alone or OR-ed together, while an escaped tilde is handled as part of the search text.
+     */
+    @Test
+    public void fullTextWithFuzziness() throws Exception {
+        Tree index = setup(b -> b.indexRule("nt:base").property("propa").analyzed(), idx -> { }, "propa");
+
+        // content: "b" holds a literal tilde, "a" and "c" are the fuzzy targets
+        Tree testRoot = root.getTree("/").addChild("test");
+        testRoot.addChild("a").setProperty("propa", "Hello World!");
+        testRoot.addChild("b").setProperty("propa", "hello~folks!");
+        testRoot.addChild("c").setProperty("propa", "Hello everyone!");
+        root.commit();
+
+        String singleFuzzy = "//*[jcr:contains(@propa, 'wordl~0.5')]";
+        String twoFuzzyTerms = "//*[jcr:contains(@propa, 'wordl~0.5 OR everone~0.5')]";
+        String escapedTilde = "//*[jcr:contains(@propa, 'hello\\~folks')]";
+
+        assertEventually(() -> {
+            // the query must be served by the index created above
+            String expectedPlan = indexOptions.getIndexType() + ":" + index.getName();
+            assertThat(explain(singleFuzzy, XPATH), containsString(expectedPlan));
+
+            assertQuery(singleFuzzy, XPATH, List.of("/test/a"));
+            assertQuery(twoFuzzyTerms, XPATH, List.of("/test/a", "/test/c"));
+            assertQuery(escapedTilde, XPATH, List.of("/test/b"));
+        });
+    }
+
+
@Test
public void fullTextQueryRegExp() throws Exception {
Tree index = setup(builder ->
builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
@@ -318,7 +346,7 @@ public abstract class FullTextIndexCommonTest extends
AbstractQueryTest {
);
}
- private Tree setup(Consumer<IndexDefinitionBuilder> builderHook,
Consumer<Tree> indexHook, String... propNames) throws Exception {
+ protected Tree setup(Consumer<IndexDefinitionBuilder> builderHook,
Consumer<Tree> indexHook, String... propNames) throws Exception {
IndexDefinitionBuilder builder = indexOptions.createIndex(
indexOptions.createIndexDefinitionBuilder(), false, propNames);
builder.noAsync();
@@ -332,7 +360,7 @@ public abstract class FullTextIndexCommonTest extends
AbstractQueryTest {
return index;
}
- private String explain(String query, String lang) {
+ protected String explain(String query, String lang) {
String explain = "explain " + query;
return executeQuery(explain, lang).get(0);
}