This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 360421736e OAK-12053 | oak-search-elastic: set max analyzed offset for
highlights (#2681)
360421736e is described below
commit 360421736e20ea7218d7c92e09912b9638fb1d90
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Wed Jan 14 11:11:10 2026 +0100
OAK-12053 | oak-search-elastic: set max analyzed offset for highlights
(#2681)
* OAK-12053: set max analyzed offset for highlights to avoid query failures
for large fields
* OAK-12053: minor improvements
* OAK-12053: minor improvements
* OAK-12053: minor improvements
* OAK-12053: minor improvements
* OAK-12053: add TODO to improve logic when upgrading to v9.x
---
.../index/elastic/query/ElasticRequestHandler.java | 14 ++++++++--
.../jackrabbit/oak/plugins/index/ExcerptTest.java | 30 +++++++++++++++++++---
2 files changed, 39 insertions(+), 5 deletions(-)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 640b48a3d5..6b8d8ab44e 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -82,7 +82,6 @@ import org.slf4j.LoggerFactory;
import javax.jcr.PropertyType;
import java.io.IOException;
-import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -131,6 +130,11 @@ public class ElasticRequestHandler {
private static final String HIGHLIGHT_PREFIX = "<strong>";
private static final String HIGHLIGHT_SUFFIX = "</strong>";
+ // by default, highlight analyzes up to 1M characters. If the content is
larger than that, an error is thrown.
+ // To avoid that we need to set a limit lower than that.
+ // TODO: when upgrading to 9.x this value can be set to -1 to implicitly
set the limit to index.highlight.max_analyzed_offset
+ // https://github.com/elastic/elasticsearch/pull/118895
+ private static final int HIGHLIGHT_MAX_ANALYZED_OFFSET = 999_999;
// Match Lucene 4.x fuzzy queries (e.g., roam~0.8), but not 5.x and beyond
(e.g., roam~2)
private static final Pattern LUCENE_4_FUZZY_PATTERN =
Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
@@ -941,13 +945,18 @@ public class ElasticRequestHandler {
* @return a Highlight object representing the excerpts to request or null
if none should be requested
*/
public Highlight highlight() {
+ // if the query does not have a full text constraint, the excerpt
makes no sense (it will always be empty)
+ if (indexPlan.getFilter().getFullTextConstraint() == null) {
+ return null;
+ }
+
Map<String, HighlightField> excerpts =
indexPlan.getFilter().getPropertyRestrictions().stream()
.filter(pr ->
pr.propertyName.startsWith(QueryConstants.REP_EXCERPT))
.map(this::excerptField)
.distinct()
.collect(Collectors.toMap(
Function.identity(),
- field -> HighlightField.of(hf -> hf.withJson(new
StringReader("{}"))))
+ field -> HighlightField.of(hf -> hf))
);
if (excerpts.isEmpty()) {
@@ -958,6 +967,7 @@ public class ElasticRequestHandler {
.preTags(HIGHLIGHT_PREFIX)
.postTags(HIGHLIGHT_SUFFIX)
.fields(excerpts)
+ .maxAnalyzedOffset(HIGHLIGHT_MAX_ANALYZED_OFFSET)
.numberOfFragments(1)
.requireFieldMatch(false));
}
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
index a265a70c66..71e713251b 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
@@ -18,6 +18,7 @@
*/
package org.apache.jackrabbit.oak.plugins.index;
+import org.apache.commons.lang3.RandomStringUtils;
import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyValue;
@@ -29,13 +30,12 @@ import
org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.plugins.index.search.IndexFormatVersion;
import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
import org.apache.jackrabbit.oak.query.AbstractQueryTest;
+import org.apache.jackrabbit.oak.spi.filter.PathFilter;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
@@ -73,6 +73,7 @@ public abstract class ExcerptTest extends AbstractQueryTest {
def.setProperty(REINDEX_PROPERTY_NAME, true);
def.setProperty(FulltextIndexConstants.EVALUATE_PATH_RESTRICTION,
true);
def.setProperty(FulltextIndexConstants.COMPAT_MODE,
IndexFormatVersion.V2.getVersion());
+ def.setProperty(PathFilter.PROP_INCLUDED_PATHS, List.of("/testRoot"),
Type.STRINGS);
Tree properties = def.addChild(FulltextIndexConstants.INDEX_RULES)
.addChild("nt:base")
@@ -104,7 +105,7 @@ public abstract class ExcerptTest extends AbstractQueryTest
{
contentRoot.setProperty("baz", "fox ifoxing");
root.commit();
- List<String> columns = new ArrayList<>(Arrays.asList("rep:excerpt",
"rep:excerpt(.)", "rep:excerpt(foo)", "rep:excerpt(bar)"));
+ List<String> columns = List.of("rep:excerpt", "rep:excerpt(.)",
"rep:excerpt(foo)", "rep:excerpt(bar)");
String selectColumns = columns.stream().map(col -> "[" + col +
"]").collect(Collectors.joining(","));
String query = "SELECT " + selectColumns + " FROM [nt:base] WHERE
CONTAINS(*, 'fox')";
assertEventually(() -> {
@@ -320,4 +321,27 @@ public abstract class ExcerptTest extends
AbstractQueryTest {
}
});
}
+
+ @Test
+ public void excerptOnLargeField() throws Exception {
+ Tree contentRoot = root.getTree("/").addChild("testRoot");
+ StringBuilder largeContent = new StringBuilder("fox ");
+ for (int i = 0; i < 1_000_000; i++) {
+
largeContent.append(RandomStringUtils.insecure().nextAlphabetic(5)).append(" ");
+ }
+ largeContent.append(" foxing");
+ contentRoot.addChild("relative").setProperty("baz",
largeContent.toString());
+ root.commit();
+
+ String query = "SELECT [rep:excerpt(.)] FROM [nt:base] WHERE
CONTAINS(*, 'fox')";
+ assertEventually(() -> {
+ try {
+ Result result = executeQuery(query, SQL2, NO_BINDINGS);
+ Iterator<? extends ResultRow> resultIter =
result.getRows().iterator();
+ assertTrue(resultIter.hasNext());
+ } catch (ParseException e) {
+ fail(e.getMessage());
+ }
+ });
+ }
}