This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 958e1ff69d OAK-10372: oak-search-elastic: similarity queries produce
no relevant results (#1046)
958e1ff69d is described below
commit 958e1ff69d0d22524dca0920c91f2605a447e38d
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Fri Aug 4 16:45:21 2023 +0200
OAK-10372: oak-search-elastic: similarity queries produce no relevant
results (#1046)
* OAK-10372: oak-search-elastic: similarity queries produce no relevant
results
* OAK-10372: (minor) improved comments
* OAK-10372: make similarity fields configurable
---
.../index/elastic/ElasticIndexDefinition.java | 26 ++++++++++++++
.../index/elastic/query/ElasticRequestHandler.java | 8 ++++-
.../index/elastic/ElasticIndexQueryCommonTest.java | 42 ++++++++++++++++++++++
.../oak/plugins/index/IndexQueryCommonTest.java | 6 ++--
4 files changed, 78 insertions(+), 4 deletions(-)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
index 61656c4296..70a975760b 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
@@ -104,6 +104,17 @@ public class ElasticIndexDefinition extends
IndexDefinition {
private static final String SIMILARITY_TAGS_ENABLED =
"similarityTagsEnabled";
private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;
+ private static final String SIMILARITY_TAGS_FIELDS =
"similarityTagsFields";
+
+ // MLT queries, when no fields are specified, do not use the entire
document but only a maximum of
+ // max_query_terms (default 25). Even increasing this value, the query
could produce not so relevant
+ // results (eg: based on the :fulltext content). To work this around, we
can specify DYNAMIC_BOOST_FULLTEXT
+ // field as first field since it usually contains relevant terms. This
will make sure that the MLT queries
+ // give more priority to the terms in this field while the rest (*) are
considered secondary.
+ private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new
String[] {
+ DYNAMIC_BOOST_FULLTEXT, "*"
+ };
+
private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
private static final float SIMILARITY_TAGS_BOOST_DEFAULT = 0.5f;
@@ -136,6 +147,8 @@ public class ElasticIndexDefinition extends IndexDefinition
{
private final Map<String, List<PropertyDefinition>> propertiesByName;
private final List<PropertyDefinition> dynamicBoostProperties;
private final List<PropertyDefinition> similarityProperties;
+ private final List<PropertyDefinition> similarityTagsProperties;
+ private final String[] similarityTagsFields;
public ElasticIndexDefinition(NodeState root, NodeState defn, String
indexPath, String indexPrefix) {
super(root, defn, determineIndexFormatVersion(defn),
determineUniqueId(defn), indexPath);
@@ -157,6 +170,7 @@ public class ElasticIndexDefinition extends IndexDefinition
{
this.failOnError = getOptionalValue(defn, FAIL_ON_ERROR,
Boolean.parseBoolean(System.getProperty(TYPE_ELASTICSEARCH +
"." + FAIL_ON_ERROR, Boolean.toString(FAIL_ON_ERROR_DEFAULT)))
);
+ this.similarityTagsFields = getOptionalValues(defn,
SIMILARITY_TAGS_FIELDS, Type.STRINGS, String.class,
SIMILARITY_TAGS_FIELDS_DEFAULT);
this.propertiesByName = getDefinedRules()
.stream()
@@ -181,6 +195,10 @@ public class ElasticIndexDefinition extends
IndexDefinition {
.stream()
.flatMap(rule -> rule.getSimilarityProperties().stream())
.collect(Collectors.toList());
+
+ this.similarityTagsProperties = propertiesByName.values().stream()
+ .flatMap(List::stream)
+ .filter(pd -> pd.similarityTags).collect(Collectors.toList());
}
@Nullable
@@ -213,6 +231,14 @@ public class ElasticIndexDefinition extends
IndexDefinition {
return similarityProperties;
}
+ public List<PropertyDefinition> getSimilarityTagsProperties() {
+ return similarityTagsProperties;
+ }
+
+ public String[] getSimilarityTagsFields() {
+ return similarityTagsFields;
+ }
+
public boolean areSimilarityTagsEnabled() {
return similarityTagsEnabled;
}
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index f64583f5d8..9c6c777345 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -36,6 +36,7 @@ import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
@@ -183,7 +184,10 @@ public class ElasticRequestHandler {
fnb.must(m ->
m.bool(similarityQuery(queryNodePath, sp)));
}
- if
(elasticIndexDefinition.areSimilarityTagsEnabled()) {
+ // Add should clause to improve relevance
using similarity tags only when similarity is
+ // enabled and there is at least one
similarity tag property
+ if
(elasticIndexDefinition.areSimilarityTagsEnabled() &&
+
!elasticIndexDefinition.getSimilarityTagsProperties().isEmpty()) {
// add should clause to improve relevance
using similarity tags
fnb.should(s -> s
.moreLikeThis(m -> m
@@ -393,6 +397,8 @@ public class ElasticRequestHandler {
// this is needed to workaround
https://github.com/elastic/elasticsearch/pull/94518 that causes empty
// results when the _ignored metadata field is part of the
input document
.perFieldAnalyzer("_ignored", "keyword")));
+ // when no fields are specified, we set the ones from the index
definition
+
mlt.fields(Arrays.asList(elasticIndexDefinition.getSimilarityTagsFields()));
} else {
// This is for native queries if someone sends additional fields
via
// mlt.fl=field1,field2
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
index 5f05e98ae3..02409cfb40 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java
@@ -17,10 +17,17 @@
package org.apache.jackrabbit.oak.plugins.index.elastic;
import org.apache.jackrabbit.oak.InitialContentHelper;
+import org.apache.jackrabbit.oak.api.CommitFailedException;
import org.apache.jackrabbit.oak.api.ContentRepository;
+import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.plugins.index.IndexQueryCommonTest;
+import org.apache.jackrabbit.oak.plugins.index.TestUtil;
+import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
import org.junit.ClassRule;
+import org.junit.Test;
+
+import javax.jcr.query.Query;
public class ElasticIndexQueryCommonTest extends IndexQueryCommonTest {
@@ -40,6 +47,41 @@ public class ElasticIndexQueryCommonTest extends
IndexQueryCommonTest {
return repositoryOptionsUtil.getOak().createContentRepository();
}
+ @Test
+ public void similarityQueryShouldCorrectlyHandleSimilarityTags() throws
CommitFailedException {
+ String query = "explain select [jcr:path] from [nt:base] where " +
+ "native('lucene',
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
+
+ String explainWithoutSimilarityTags = "[nt:base] as [nt:base] /*
elasticsearch:test-index(/oak:index/test-index)
{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}}"
+
+ " where native([nt:base], [lucene],
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0') */";
+
+ Tree test = root.getTree("/").addChild("test");
+ test.addChild("a").setProperty("text", "Hello World");
+ test.addChild("b").setProperty("text", "He said Hello and then the
world said Hello as well.");
+ test.addChild("c").setProperty("text", "He said Hi.");
+
+ indexDefn.setProperty("similarityTagsEnabled", false);
+ root.commit();
+
+ // similarity tags disabled, should not be present in the explain
+ assertEventually(getAssertionForExplain(query, Query.JCR_SQL2,
explainWithoutSimilarityTags, true));
+
+ indexDefn.setProperty("similarityTagsEnabled", true);
+ root.commit();
+
+ // similarity tags enabled, but no similarity tags properties
configured, should not be present in the explain
+ assertEventually(getAssertionForExplain(query, Query.JCR_SQL2,
explainWithoutSimilarityTags, true));
+
+ String explainWithSimilarityTags = "[nt:base] as [nt:base] /*
elasticsearch:test-index(/oak:index/test-index)
{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}],\"should\":[{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}}"
+
+ " where native([nt:base], [lucene],
'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0') */";
+ Tree properties =
indexDefn.getChild(FulltextIndexConstants.INDEX_RULES).getChild("nt:base").getChild("properties");
+ Tree simProp = TestUtil.enableForFullText(properties, "simProp",
false);
+ simProp.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, true);
+ root.commit();
+
+ assertEventually(getAssertionForExplain(query, Query.JCR_SQL2,
explainWithSimilarityTags, true));
+ }
+
@Override
public String getContainsValueForEqualityQuery_native() {
return
"\"filter\":[{\"term\":{\":ancestors\":{\"value\":\"/test\"}}},{\"term\":{\"propa.keyword\":{\"value\":\"bar\"}}}]";
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
index ec9009fa46..fa62cbb4cc 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
@@ -57,7 +57,7 @@ import static org.junit.Assert.fail;
*/
public abstract class IndexQueryCommonTest extends AbstractQueryTest {
- private Tree indexDefn;
+ protected Tree indexDefn;
protected IndexOptions indexOptions;
protected TestRepository repositoryOptionsUtil;
private LogCustomizer logCustomizer;
@@ -813,7 +813,7 @@ public abstract class IndexQueryCommonTest extends
AbstractQueryTest {
public abstract String
getExplainValueForDescendantTestWithIndexTagExplain();
- private Runnable getAssertionForExplain(String query, String language,
String expected, boolean matchComplete) {
+ protected Runnable getAssertionForExplain(String query, String language,
String expected, boolean matchComplete) {
return () -> {
Result result = null;
try {
@@ -830,7 +830,7 @@ public abstract class IndexQueryCommonTest extends
AbstractQueryTest {
};
}
- private static void assertEventually(Runnable r) {
+ protected static void assertEventually(Runnable r) {
TestUtil.assertEventually(r, 3000 * 3);
}
}