This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 53c196460c OAK-11603: lucene 4.x fuzzy queries don't work in Elastic (#2180)
53c196460c is described below

commit 53c196460c03383e743a727ddb4db4234161aaaf
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Fri Mar 14 17:26:26 2025 +0100

    OAK-11603: lucene 4.x fuzzy queries don't work in Elastic (#2180)
    
    * OAK-11603: lucene 4.x fuzzy queries don't work in Elastic
    
    * OAK-11603: improve fuzzy conversion
---
 .../index/elastic/query/ElasticRequestHandler.java | 80 +++++++++++++++++++++-
 .../index/elastic/ElasticFullTextIndexTest.java    | 30 ++++++++
 .../oak/plugins/index/FullTextIndexCommonTest.java | 34 ++++++++-
 3 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 6b9d71eec2..713d7d31f5 100644
--- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -104,6 +104,8 @@ import java.util.function.BiConsumer;
 import java.util.function.BiPredicate;
 import java.util.function.Consumer;
 import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
@@ -126,6 +128,11 @@ public class ElasticRequestHandler {
     private static final String HIGHLIGHT_PREFIX = "<strong>";
     private static final String HIGHLIGHT_SUFFIX = "</strong>";
 
+    // Match Lucene 4.x fuzzy queries (e.g., roam~0.8), but not 5.x and beyond 
(e.g., roam~2)
+    private static final Pattern LUCENE_4_FUZZY_PATTERN = 
Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
+    // From Lucene 5 and above (used by elastic), the fuzzy query syntax has 
changed to use a single integer
+    private static final Pattern ELASTIC_FUZZY_PATTERN = 
Pattern.compile("\\b(\\w+)~([0-2])\\b");
+
     private final IndexPlan indexPlan;
     private final Filter filter;
     private final PlanResult planResult;
@@ -889,10 +896,10 @@ public class ElasticRequestHandler {
         return Query.of(q -> q.multiMatch(m -> m.fields(uuid)));
     }
 
-    private static QueryStringQuery.Builder fullTextQuery(String text, String 
fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
+    private QueryStringQuery.Builder fullTextQuery(String text, String 
fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
         LOG.debug("fullTextQuery for text: '{}', fieldName: '{}'", text, 
fieldName);
         QueryStringQuery.Builder qsqBuilder = new QueryStringQuery.Builder()
-                .query(FulltextIndex.rewriteQueryText(text))
+                .query(rewriteQueryText(text))
                 .defaultOperator(Operator.And)
                 .type(TextQueryType.CrossFields)
                 .tieBreaker(0.5d);
@@ -908,6 +915,75 @@ public class ElasticRequestHandler {
         return qsqBuilder.fields(fieldName);
     }
 
+    private String rewriteQueryText(String text) {
+        String rewritten = FulltextIndex.rewriteQueryText(text);
+
+        // here we handle special cases where the syntax used in the lucene 
4.x query parser is not supported by the current version
+        rewritten = convertFuzzyQuery(rewritten);
+
+        return rewritten;
+    }
+
+    /**
+     * Converts Lucene fuzzy queries from the old syntax (float similarity) to 
the new syntax (edit distance).
+     * <p>
+     * In Lucene 4, fuzzy queries were specified using a floating-point 
similarity (e.g., "term~0.8"), where values
+     * closer to 1 required a higher similarity match. In later Lucene 
versions, this was replaced with a discrete
+     * edit distance (0, 1, or 2).
+     * <p>
+     * This method:
+     * <ul>
+     *   <li>Detects and converts old fuzzy queries (e.g., "roam~0.7" → 
"roam~1").</li>
+     *   <li>Preserves new fuzzy queries (e.g., "test~2" remains 
unchanged).</li>
+     *   <li>Avoids modifying proximity queries (e.g., "\"quick fox\"~5" 
remains unchanged).</li>
+     * </ul>
+     *
+     * @param text The input query string containing fuzzy or proximity 
queries.
+     * @return A query string where old fuzzy syntax is converted to the new 
format.
+     */
+    private String convertFuzzyQuery(String text) {
+        if (!text.contains("~")) {
+            return text;
+        }
+        Matcher lucene4FuzzyMatcher = LUCENE_4_FUZZY_PATTERN.matcher(text);
+
+        if (!lucene4FuzzyMatcher.find()) {
+            // this can only happen if the pattern is not found, which means 
we are dealing with a tilde not related to a fuzzy query
+            return text;
+        }
+
+        StringBuilder result = new StringBuilder();
+        do {
+            String term = lucene4FuzzyMatcher.group(1);
+            String fuzzyValue = lucene4FuzzyMatcher.group(2);
+
+            // Skip if it's already using the new syntax (integer 0-2)
+            if (ELASTIC_FUZZY_PATTERN.matcher(term + "~" + 
fuzzyValue).matches()) {
+                continue;
+            }
+
+            // Convert floating-point similarity to integer edit distance
+            int editDistance = 2; // Default to the most lenient setting
+            try {
+                float similarity = Float.parseFloat(fuzzyValue);
+                if (similarity >= 0.8f) {
+                    editDistance = 0;
+                } else if (similarity >= 0.5f) {
+                    editDistance = 1;
+                }
+            } catch (NumberFormatException e) {
+                LOG.warn("Invalid fuzzy value: {} for query text {}, using 
default edit distance of 2", fuzzyValue, text);
+            }
+
+            lucene4FuzzyMatcher.appendReplacement(result, term + "~" + 
editDistance);
+        } while (lucene4FuzzyMatcher.find());
+
+        lucene4FuzzyMatcher.appendTail(result);
+        String resultString = result.toString();
+        LOG.info("Converted fuzzy query from '{}' to '{}'", text, 
resultString);
+        return resultString;
+    }
+
     private Query createQuery(String propertyName, Filter.PropertyRestriction 
pr, PropertyDefinition defn) {
         final String field = 
elasticIndexDefinition.getElasticKeyword(propertyName);
 
diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
index 37885b7502..457add8e22 100644
--- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
+++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java
@@ -17,8 +17,15 @@
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
 import org.apache.jackrabbit.oak.api.ContentRepository;
+import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.plugins.index.FullTextIndexCommonTest;
 import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.MatcherAssert.assertThat;
 
 public class ElasticFullTextIndexTest extends FullTextIndexCommonTest {
 
@@ -40,4 +47,27 @@ public class ElasticFullTextIndexTest extends FullTextIndexCommonTest {
         setTraversalEnabled(false);
     }
 
+    @Test
+    public void fullTextWithFuzzyEditDistance() throws Exception {
+        Tree index = setup(builder -> 
builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
+                },
+                "propa");
+
+        //add content
+        Tree test = root.getTree("/").addChild("test");
+
+        test.addChild("a").setProperty("propa", "Hello World!");
+        test.addChild("b").setProperty("propa", "Simple test");
+        root.commit();
+
+        String misspelledWorld = "//*[jcr:contains(@propa, 'wordl~0.5')]";
+        String mixedFuzzyFormats = "//*[jcr:contains(@propa, 'wordl~0.5 OR 
sample~1')]";
+
+        assertEventually(() -> {
+            assertThat(explain(misspelledWorld, XPATH), 
containsString(indexOptions.getIndexType() + ":" + index.getName()));
+            assertQuery(misspelledWorld, XPATH, List.of("/test/a"));
+            assertQuery(mixedFuzzyFormats, XPATH, List.of("/test/a", 
"/test/b"));
+        });
+    }
+
 }
diff --git a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
index abc2a6b210..859acaf435 100644
--- a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
+++ b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java
@@ -75,7 +75,8 @@ public abstract class FullTextIndexCommonTest extends AbstractQueryTest {
         test.addChild("a").setProperty("propa", "Hello everyone. This is a 
fulltext test");
         root.commit();
 
-        // fuzziness support the following syntax: <term>~[edit_distance] (eg: 
hello~2). The query below is invalid
+        // fuzziness support the following syntax: <term>~[edit_distance] (eg: 
hello~[similarity value]). The query below is invalid
+        // 
https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches
         // 
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-fuzziness
         String query = "//*[jcr:contains(@propa, 'hello e~one')]";
 
@@ -85,6 +86,33 @@ public abstract class FullTextIndexCommonTest extends AbstractQueryTest {
         });
     }
 
+    @Test
+    public void fullTextWithFuzziness() throws Exception {
+        Tree index = setup(builder -> 
builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
+                },
+                "propa");
+
+        //add content
+        Tree test = root.getTree("/").addChild("test");
+
+        test.addChild("a").setProperty("propa", "Hello World!");
+        test.addChild("b").setProperty("propa", "hello~folks!");
+        test.addChild("c").setProperty("propa", "Hello everyone!");
+        root.commit();
+
+        String misspelledWorld = "//*[jcr:contains(@propa, 'wordl~0.5')]";
+        String multipleMisspelledWorlds = "//*[jcr:contains(@propa, 'wordl~0.5 
OR everone~0.5')]";
+        String withTilde = "//*[jcr:contains(@propa, 'hello\\~folks')]";
+
+        assertEventually(() -> {
+            assertThat(explain(misspelledWorld, XPATH), 
containsString(indexOptions.getIndexType() + ":" + index.getName()));
+
+            assertQuery(misspelledWorld, XPATH, List.of("/test/a"));
+            assertQuery(multipleMisspelledWorlds, XPATH, List.of("/test/a", 
"/test/c"));
+            assertQuery(withTilde, XPATH, List.of("/test/b"));
+        });
+    }
+
     @Test
     public void fullTextQueryRegExp() throws Exception {
         Tree index = setup(builder -> 
builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
@@ -318,7 +346,7 @@ public abstract class FullTextIndexCommonTest extends AbstractQueryTest {
         );
     }
 
-    private Tree setup(Consumer<IndexDefinitionBuilder> builderHook, 
Consumer<Tree> indexHook, String... propNames) throws Exception {
+    protected Tree setup(Consumer<IndexDefinitionBuilder> builderHook, 
Consumer<Tree> indexHook, String... propNames) throws Exception {
         IndexDefinitionBuilder builder = indexOptions.createIndex(
                 indexOptions.createIndexDefinitionBuilder(), false, propNames);
         builder.noAsync();
@@ -332,7 +360,7 @@ public abstract class FullTextIndexCommonTest extends AbstractQueryTest {
         return index;
     }
 
-    private String explain(String query, String lang) {
+    protected String explain(String query, String lang) {
         String explain = "explain " + query;
         return executeQuery(explain, lang).get(0);
     }

Reply via email to