This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 958e1ff69d OAK-10372: oak-search-elastic: similarity queries produce 
no relevant results (#1046)
958e1ff69d is described below

commit 958e1ff69d0d22524dca0920c91f2605a447e38d
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Fri Aug 4 16:45:21 2023 +0200

    OAK-10372: oak-search-elastic: similarity queries produce no relevant 
results (#1046)
    
    * OAK-10372: oak-search-elastic: similarity queries produce no relevant 
results
    
    * OAK-10372: (minor) improved comments
    
    * OAK-10372: make similarity fields configurable
---
 .../index/elastic/ElasticIndexDefinition.java      | 26 ++++++++++++++
 .../index/elastic/query/ElasticRequestHandler.java |  8 ++++-
 .../index/elastic/ElasticIndexQueryCommonTest.java | 42 ++++++++++++++++++++++
 .../oak/plugins/index/IndexQueryCommonTest.java    |  6 ++--
 4 files changed, 78 insertions(+), 4 deletions(-)

diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
index 61656c4296..70a975760b 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
@@ -104,6 +104,17 @@ public class ElasticIndexDefinition extends 
IndexDefinition {
     private static final String SIMILARITY_TAGS_ENABLED = 
"similarityTagsEnabled";
     private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;
 
+    private static final String SIMILARITY_TAGS_FIELDS = 
"similarityTagsFields";
+
+    // MLT queries, when no fields are specified, do not use the entire 
document but only a maximum of
+    // max_query_terms (default 25). Even increasing this value, the query 
could produce not so relevant
+    // results (eg: based on the :fulltext content). To work this around, we 
can specify DYNAMIC_BOOST_FULLTEXT
+    // field as first field since it usually contains relevant terms. This 
will make sure that the MLT queries
+    // give more priority to the terms in this field while the rest (*) are 
considered secondary.
+    private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new 
String[] {
+            DYNAMIC_BOOST_FULLTEXT, "*"
+    };
+
     private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
     private static final float SIMILARITY_TAGS_BOOST_DEFAULT = 0.5f;
 
@@ -136,6 +147,8 @@ public class ElasticIndexDefinition extends IndexDefinition 
{
     private final Map<String, List<PropertyDefinition>> propertiesByName;
     private final List<PropertyDefinition> dynamicBoostProperties;
     private final List<PropertyDefinition> similarityProperties;
+    private final List<PropertyDefinition> similarityTagsProperties;
+    private final String[] similarityTagsFields;
 
     public ElasticIndexDefinition(NodeState root, NodeState defn, String 
indexPath, String indexPrefix) {
         super(root, defn, determineIndexFormatVersion(defn), 
determineUniqueId(defn), indexPath);
@@ -157,6 +170,7 @@ public class ElasticIndexDefinition extends IndexDefinition 
{
         this.failOnError = getOptionalValue(defn, FAIL_ON_ERROR,
                 Boolean.parseBoolean(System.getProperty(TYPE_ELASTICSEARCH + 
"." + FAIL_ON_ERROR, Boolean.toString(FAIL_ON_ERROR_DEFAULT)))
         );
+        this.similarityTagsFields = getOptionalValues(defn, 
SIMILARITY_TAGS_FIELDS, Type.STRINGS, String.class, 
SIMILARITY_TAGS_FIELDS_DEFAULT);
 
         this.propertiesByName = getDefinedRules()
                 .stream()
@@ -181,6 +195,10 @@ public class ElasticIndexDefinition extends 
IndexDefinition {
                 .stream()
                 .flatMap(rule -> rule.getSimilarityProperties().stream())
                 .collect(Collectors.toList());
+
+        this.similarityTagsProperties = propertiesByName.values().stream()
+                .flatMap(List::stream)
+                .filter(pd -> pd.similarityTags).collect(Collectors.toList());
     }
 
     @Nullable
@@ -213,6 +231,14 @@ public class ElasticIndexDefinition extends 
IndexDefinition {
         return similarityProperties;
     }
 
+    public List<PropertyDefinition> getSimilarityTagsProperties() {
+        return similarityTagsProperties;
+    }
+
+    public String[] getSimilarityTagsFields() {
+        return similarityTagsFields;
+    }
+
     public boolean areSimilarityTagsEnabled() {
         return similarityTagsEnabled;
     }
diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index f64583f5d8..9c6c777345 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -36,6 +36,7 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Base64;
 import java.util.HashMap;
 import java.util.List;
@@ -183,7 +184,10 @@ public class ElasticRequestHandler {
                                     fnb.must(m -> 
m.bool(similarityQuery(queryNodePath, sp)));
                                 }
 
-                                if 
(elasticIndexDefinition.areSimilarityTagsEnabled()) {
+                                // Add should clause to improve relevance 
using similarity tags only when similarity is
+                                // enabled and there is at least one 
similarity tag property
+                                if 
(elasticIndexDefinition.areSimilarityTagsEnabled() &&
+                                        
!elasticIndexDefinition.getSimilarityTagsProperties().isEmpty()) {
                                     // add should clause to improve relevance 
using similarity tags
                                     fnb.should(s -> s
                                             .moreLikeThis(m -> m
@@ -393,6 +397,8 @@ public class ElasticRequestHandler {
                     // this is needed to workaround 
https://github.com/elastic/elasticsearch/pull/94518 that causes empty
                     // results when the _ignored metadata field is part of the 
input document
                     .perFieldAnalyzer("_ignored", "keyword")));
+            // when no fields are specified, we set the ones from the index 
definition
+            
mlt.fields(Arrays.asList(elasticIndexDefinition.getSimilarityTagsFields()));
         } else {
             // This is for native queries if someone sends additional fields 
via
             // mlt.fl=field1,field2
diff --git 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
index 5f05e98ae3..02409cfb40 100644
--- 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
+++ 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
@@ -17,10 +17,17 @@
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
 import org.apache.jackrabbit.oak.InitialContentHelper;
+import org.apache.jackrabbit.oak.api.CommitFailedException;
 import org.apache.jackrabbit.oak.api.ContentRepository;
+import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.plugins.index.IndexQueryCommonTest;
+import org.apache.jackrabbit.oak.plugins.index.TestUtil;
+import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
 import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
 import org.junit.ClassRule;
+import org.junit.Test;
+
+import javax.jcr.query.Query;
 
 public class ElasticIndexQueryCommonTest extends IndexQueryCommonTest {
 
@@ -40,6 +47,41 @@ public class ElasticIndexQueryCommonTest extends 
IndexQueryCommonTest {
         return repositoryOptionsUtil.getOak().createContentRepository();
     }
 
+    @Test
+    public void similarityQueryShouldCorrectlyHandleSimilarityTags() throws 
CommitFailedException {
+        String query = "explain select [jcr:path] from [nt:base] where " +
+                "native('lucene', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
+
+        String explainWithoutSimilarityTags = "[nt:base] as [nt:base] /* 
elasticsearch:test-index(/oak:index/test-index) 
{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}}"
 +
+                " where native([nt:base], [lucene], 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0') */";
+
+        Tree test = root.getTree("/").addChild("test");
+        test.addChild("a").setProperty("text", "Hello World");
+        test.addChild("b").setProperty("text", "He said Hello and then the 
world said Hello as well.");
+        test.addChild("c").setProperty("text", "He said Hi.");
+
+        indexDefn.setProperty("similarityTagsEnabled", false);
+        root.commit();
+
+        // similarity tags disabled, should not be present in the explain
+        assertEventually(getAssertionForExplain(query, Query.JCR_SQL2, 
explainWithoutSimilarityTags, true));
+
+        indexDefn.setProperty("similarityTagsEnabled", true);
+        root.commit();
+
+        // similarity tags enabled, but no similarity tags properties 
configured, should not be present in the explain
+        assertEventually(getAssertionForExplain(query, Query.JCR_SQL2, 
explainWithoutSimilarityTags, true));
+
+        String explainWithSimilarityTags = "[nt:base] as [nt:base] /* 
elasticsearch:test-index(/oak:index/test-index) 
{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}],\"should\":[{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}}"
 +
+                " where native([nt:base], [lucene], 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0') */";
+        Tree properties = 
indexDefn.getChild(FulltextIndexConstants.INDEX_RULES).getChild("nt:base").getChild("properties");
+        Tree simProp = TestUtil.enableForFullText(properties, "simProp", 
false);
+        simProp.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, true);
+        root.commit();
+
+        assertEventually(getAssertionForExplain(query, Query.JCR_SQL2, 
explainWithSimilarityTags, true));
+    }
+
     @Override
     public String getContainsValueForEqualityQuery_native() {
         return 
"\"filter\":[{\"term\":{\":ancestors\":{\"value\":\"/test\"}}},{\"term\":{\"propa.keyword\":{\"value\":\"bar\"}}}]";
diff --git 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
index ec9009fa46..fa62cbb4cc 100644
--- 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
+++ 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
@@ -57,7 +57,7 @@ import static org.junit.Assert.fail;
  */
 public abstract class IndexQueryCommonTest extends AbstractQueryTest {
 
-    private Tree indexDefn;
+    protected Tree indexDefn;
     protected IndexOptions indexOptions;
     protected TestRepository repositoryOptionsUtil;
     private LogCustomizer logCustomizer;
@@ -813,7 +813,7 @@ public abstract class IndexQueryCommonTest extends 
AbstractQueryTest {
 
     public abstract String 
getExplainValueForDescendantTestWithIndexTagExplain();
 
-    private Runnable getAssertionForExplain(String query, String language, 
String expected, boolean matchComplete) {
+    protected Runnable getAssertionForExplain(String query, String language, 
String expected, boolean matchComplete) {
         return () -> {
             Result result = null;
             try {
@@ -830,7 +830,7 @@ public abstract class IndexQueryCommonTest extends 
AbstractQueryTest {
         };
     }
 
-    private static void assertEventually(Runnable r) {
+    protected static void assertEventually(Runnable r) {
         TestUtil.assertEventually(r, 3000 * 3);
     }
 }

Reply via email to