This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 931dfffec6 OAK-10762: dynamic boosted tags need to specify extra 
parameters (#1416)
931dfffec6 is described below

commit 931dfffec63ee13bf403a4ad76587b5371669421
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Tue Apr 23 08:28:13 2024 +0200

    OAK-10762: dynamic boosted tags need to specify extra parameters (#1416)
    
    * OAK-10762: dynamic boosted tags need to specify extra parameters
    
    * OAK-10762: remove workaround for 
https://github.com/elastic/elasticsearch/pull/94518
    
    * OAK-10762: add todo on how to further improve relevance
    
    * OAK-10762: fix failing tests
    
    * OAK-10762: typo
    
    * OAK-10762: improved todo comment
---
 .../plugins/index/elastic/ElasticIndexDefinition.java  |  9 ++++++---
 .../index/elastic/query/ElasticRequestHandler.java     | 18 +++++++++++-------
 .../index/elastic/ElasticIndexQueryCommonTest.java     |  4 ++--
 .../plugins/index/elastic/ElasticSimilarQueryTest.java |  3 +--
 .../oak/plugins/index/DynamicBoostCommonTest.java      | 10 ++++++++++
 5 files changed, 30 insertions(+), 14 deletions(-)

diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
index 839e7d5559..844c8aa990 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
@@ -118,10 +118,13 @@ public class ElasticIndexDefinition extends 
IndexDefinition {
     // MLT queries, when no fields are specified, do not use the entire 
document but only a maximum of
     // max_query_terms (default 25). Even increasing this value, the query 
could produce not so relevant
     // results (eg: based on the :fulltext content). To work this around, we 
can specify DYNAMIC_BOOST_FULLTEXT
-    // field as first field since it usually contains relevant terms. This 
will make sure that the MLT queries
-    // give more priority to the terms in this field while the rest (*) are 
considered secondary.
+    // field with overridden mlt params and increased boost since it usually 
contains relevant terms. This will make sure
+    // that the MLT queries give more priority to the terms in this field 
while the rest (*) are considered secondary.
+    // TODO: we can further improve search relevance by using the actual tags 
combined with the weights using a function query.
+    //      Right now, we are just matching the tags without looking at the 
weights. Therefore, a tag can be matched in a field with a lower weight.
     private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new 
String[] {
-            DYNAMIC_BOOST_FULLTEXT, "*"
+            "mlt.fl=" + DYNAMIC_BOOST_FULLTEXT + "&mlt.mintf=1&mlt.qf=3",
+            "mlt.fl=*&mlt.mintf=2"
     };
 
     private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 9a3dee6ea8..7ef2959e39 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -182,7 +182,16 @@ public class ElasticRequestHandler {
                     // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We 
need parse this query
                     // string and turn into a query
                     // elastic can understand.
-                    bqb.must(m -> m.moreLikeThis(mltQuery(mltParams)));
+                    String fields = 
mltParams.remove(MoreLikeThisHelperUtil.MLT_FILED);
+                    if (fields == null || FieldNames.PATH.equals(fields)) {
+                        for (String stf : 
elasticIndexDefinition.getSimilarityTagsFields()) {
+                            Map<String, String> shallowMltParams = new 
HashMap<>(MoreLikeThisHelperUtil.getParamMapFromMltQuery(stf));
+                            shallowMltParams.putAll(mltParams);
+                            bqb.should(m -> 
m.moreLikeThis(mltQuery(shallowMltParams)));
+                        }
+                    } else {
+                        bqb.must(m -> m.moreLikeThis(mltQuery(mltParams)));
+                    }
                 } else {
                     bqb.must(m -> m.bool(similarityQuery(queryNodePath, sp)));
                 }
@@ -393,12 +402,7 @@ public class ElasticRequestHandler {
             // We store path as the _id so no need to do anything extra here
             // We expect Similar impl to send a query where text would have 
evaluated to
             // node path.
-            mlt.like(l -> l.document(d -> d.id(id)
-                    // this is needed to workaround 
https://github.com/elastic/elasticsearch/pull/94518 that causes empty
-                    // results when the _ignored metadata field is part of the 
input document
-                    .perFieldAnalyzer("_ignored", "keyword")));
-            // when no fields are specified, we set the ones from the index 
definition
-            
mlt.fields(Arrays.asList(elasticIndexDefinition.getSimilarityTagsFields()));
+            mlt.like(l -> l.document(d -> d.id(id)));
         } else {
             // This is for native queries if someone sends additional fields 
via
             // mlt.fl=field1,field2
diff --git 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
index c787d1e9eb..e5b25dd063 100644
--- 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
+++ 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
@@ -52,7 +52,7 @@ public class ElasticIndexQueryCommonTest extends 
IndexQueryCommonTest {
         String query = "explain select [jcr:path] from [nt:base] where " +
                 "native('lucene', 
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
 
-        String explainWithoutSimilarityTags = 
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}";
+        String explainWithoutSimilarityTags = 
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\"
 [...]
 
         Tree test = root.getTree("/").addChild("test");
         test.addChild("a").setProperty("text", "Hello World");
@@ -71,7 +71,7 @@ public class ElasticIndexQueryCommonTest extends 
IndexQueryCommonTest {
         // similarity tags enabled, but no similarity tags properties 
configured, should not be present in the explain output
         assertEventually(getAssertionForExplain(query, Query.JCR_SQL2, 
explainWithoutSimilarityTags, false));
 
-        String explainWithSimilarityTags = 
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}],\"should\":[{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}},\"size\":10,\"sort\":[{
 [...]
+        String explainWithSimilarityTags = 
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/
 [...]
         Tree properties = 
indexDefn.getChild(FulltextIndexConstants.INDEX_RULES).getChild("nt:base").getChild("properties");
         Tree simProp = TestUtil.enableForFullText(properties, "simProp", 
false);
         simProp.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, true);
diff --git 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
index b4a51d6c51..3055007bfa 100644
--- 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
+++ 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
@@ -131,8 +131,7 @@ public class ElasticSimilarQueryTest extends 
ElasticAbstractQueryTest {
     }
 
     /**
-     * Validates the workaround for <a 
href="https://github.com/elastic/elasticsearch/pull/94518";>94518</a> produces 
the
-     * expected results
+     * This test checks <a 
href="https://github.com/elastic/elasticsearch/pull/94518";>94518</a> issue.
      */
     @Test
     public void repSimilarQueryWithIgnoredMetadataField() throws Exception {
diff --git 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
index 9cd2d5edbe..9fb4cc6053 100644
--- 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
+++ 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
@@ -175,6 +175,16 @@ public abstract class DynamicBoostCommonTest extends 
AbstractQueryTest {
         });
     }
 
+    @Test
+    public void dynamicBoostedTagsShouldShouldBeUsedInSimilarityQueries() 
throws Exception {
+        boolean lite = areAnalyzeFeaturesSupportedInLiteModeOnly();
+        createAssetsIndexAndProperties(lite, lite);
+        prepareTestAssets();
+
+        assertEventually(() -> assertOrderedQuery("select [jcr:path] from 
[dam:Asset] where similar(., '/test/asset1')",
+                List.of("/test/asset1", "/test/asset2", "/test/asset3")));
+    }
+
     protected abstract String getTestQueryDynamicBoostBasicExplained();
 
     protected boolean areAnalyzeFeaturesSupportedInLiteModeOnly() {

Reply via email to