This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 931dfffec6 OAK-10762: dynamic boosted tags need to specify extra
parameters (#1416)
931dfffec6 is described below
commit 931dfffec63ee13bf403a4ad76587b5371669421
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Tue Apr 23 08:28:13 2024 +0200
OAK-10762: dynamic boosted tags need to specify extra parameters (#1416)
* OAK-10762: dynamic boosted tags need to specify extra parameters
* OAK-10762: remove workaround for
https://github.com/elastic/elasticsearch/pull/94518
* OAK-10762: add todo on how to further improve relevance
* OAK-10762: fix failing tests
* OAK-10762: typo
* OAK-10762: improved todo comment
---
.../plugins/index/elastic/ElasticIndexDefinition.java | 9 ++++++---
.../index/elastic/query/ElasticRequestHandler.java | 18 +++++++++++-------
.../index/elastic/ElasticIndexQueryCommonTest.java | 4 ++--
.../plugins/index/elastic/ElasticSimilarQueryTest.java | 3 +--
.../oak/plugins/index/DynamicBoostCommonTest.java | 10 ++++++++++
5 files changed, 30 insertions(+), 14 deletions(-)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
index 839e7d5559..844c8aa990 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
@@ -118,10 +118,13 @@ public class ElasticIndexDefinition extends
IndexDefinition {
// MLT queries, when no fields are specified, do not use the entire
document but only a maximum of
// max_query_terms (default 25). Even increasing this value, the query
could produce not so relevant
// results (eg: based on the :fulltext content). To work this around, we
can specify DYNAMIC_BOOST_FULLTEXT
- // field as first field since it usually contains relevant terms. This
will make sure that the MLT queries
- // give more priority to the terms in this field while the rest (*) are
considered secondary.
+ // field with overridden mlt params and increased boost since it usually
contains relevant terms. This will make sure
+ // that the MLT queries give more priority to the terms in this field
while the rest (*) are considered secondary.
+ // TODO: we can further improve search relevance by using the actual tags
combined with the weights using a function query.
+ // Right now, we are just matching the tags without looking at the
weights. Therefore, a tag can be matched in a field with a lower weight.
private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new
String[] {
- DYNAMIC_BOOST_FULLTEXT, "*"
+ "mlt.fl=" + DYNAMIC_BOOST_FULLTEXT + "&mlt.mintf=1&mlt.qf=3",
+ "mlt.fl=*&mlt.mintf=2"
};
private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 9a3dee6ea8..7ef2959e39 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -182,7 +182,16 @@ public class ElasticRequestHandler {
// mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We
need parse this query
// string and turn into a query
// elastic can understand.
- bqb.must(m -> m.moreLikeThis(mltQuery(mltParams)));
+ String fields =
mltParams.remove(MoreLikeThisHelperUtil.MLT_FILED);
+ if (fields == null || FieldNames.PATH.equals(fields)) {
+ for (String stf :
elasticIndexDefinition.getSimilarityTagsFields()) {
+ Map<String, String> shallowMltParams = new
HashMap<>(MoreLikeThisHelperUtil.getParamMapFromMltQuery(stf));
+ shallowMltParams.putAll(mltParams);
+ bqb.should(m ->
m.moreLikeThis(mltQuery(shallowMltParams)));
+ }
+ } else {
+ bqb.must(m -> m.moreLikeThis(mltQuery(mltParams)));
+ }
} else {
bqb.must(m -> m.bool(similarityQuery(queryNodePath, sp)));
}
@@ -393,12 +402,7 @@ public class ElasticRequestHandler {
// We store path as the _id so no need to do anything extra here
// We expect Similar impl to send a query where text would have
evaluated to
// node path.
- mlt.like(l -> l.document(d -> d.id(id)
- // this is needed to workaround
https://github.com/elastic/elasticsearch/pull/94518 that causes empty
- // results when the _ignored metadata field is part of the
input document
- .perFieldAnalyzer("_ignored", "keyword")));
- // when no fields are specified, we set the ones from the index
definition
-
mlt.fields(Arrays.asList(elasticIndexDefinition.getSimilarityTagsFields()));
+ mlt.like(l -> l.document(d -> d.id(id)));
} else {
// This is for native queries if someone sends additional fields
via
// mlt.fl=field1,field2
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
index c787d1e9eb..e5b25dd063 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
@@ -52,7 +52,7 @@ public class ElasticIndexQueryCommonTest extends
IndexQueryCommonTest {
String query = "explain select [jcr:path] from [nt:base] where " +
"native('lucene',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
- String explainWithoutSimilarityTags =
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}";
+ String explainWithoutSimilarityTags =
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\"
[...]
Tree test = root.getTree("/").addChild("test");
test.addChild("a").setProperty("text", "Hello World");
@@ -71,7 +71,7 @@ public class ElasticIndexQueryCommonTest extends
IndexQueryCommonTest {
// similarity tags enabled, but no similarity tags properties
configured, should not be present in the explain output
assertEventually(getAssertionForExplain(query, Query.JCR_SQL2,
explainWithoutSimilarityTags, false));
- String explainWithSimilarityTags =
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}],\"should\":[{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}},\"size\":10,\"sort\":[{
[...]
+ String explainWithSimilarityTags =
"{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/
[...]
Tree properties =
indexDefn.getChild(FulltextIndexConstants.INDEX_RULES).getChild("nt:base").getChild("properties");
Tree simProp = TestUtil.enableForFullText(properties, "simProp",
false);
simProp.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, true);
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
index b4a51d6c51..3055007bfa 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
@@ -131,8 +131,7 @@ public class ElasticSimilarQueryTest extends
ElasticAbstractQueryTest {
}
/**
- * Validates the workaround for <a
href="https://github.com/elastic/elasticsearch/pull/94518">94518</a> produces
the
- * expected results
+ * This test checks <a
href="https://github.com/elastic/elasticsearch/pull/94518">94518</a> issue.
*/
@Test
public void repSimilarQueryWithIgnoredMetadataField() throws Exception {
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
index 9cd2d5edbe..9fb4cc6053 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
@@ -175,6 +175,16 @@ public abstract class DynamicBoostCommonTest extends
AbstractQueryTest {
});
}
+ @Test
+ public void dynamicBoostedTagsShouldShouldBeUsedInSimilarityQueries()
throws Exception {
+ boolean lite = areAnalyzeFeaturesSupportedInLiteModeOnly();
+ createAssetsIndexAndProperties(lite, lite);
+ prepareTestAssets();
+
+ assertEventually(() -> assertOrderedQuery("select [jcr:path] from
[dam:Asset] where similar(., '/test/asset1')",
+ List.of("/test/asset1", "/test/asset2", "/test/asset3")));
+ }
+
protected abstract String getTestQueryDynamicBoostBasicExplained();
protected boolean areAnalyzeFeaturesSupportedInLiteModeOnly() {