This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 360421736e OAK-12053 | oak-search-elastic: set max analyzed offset for 
highlights (#2681)
360421736e is described below

commit 360421736e20ea7218d7c92e09912b9638fb1d90
Author: Fabrizio Fortino <[email protected]>
AuthorDate: Wed Jan 14 11:11:10 2026 +0100

    OAK-12053 | oak-search-elastic: set max analyzed offset for highlights 
(#2681)
    
    * OAK-12053: set max analyzed offset for highlights to avoid query failures 
for large fields
    
    * OAK-12053: minor improvements
    
    * OAK-12053: minor improvements
    
    * OAK-12053: minor improvements
    
    * OAK-12053: minor improvements
    
    * OAK-12053: add TODO to improve logic when upgrading to v9.x
---
 .../index/elastic/query/ElasticRequestHandler.java | 14 ++++++++--
 .../jackrabbit/oak/plugins/index/ExcerptTest.java  | 30 +++++++++++++++++++---
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 640b48a3d5..6b8d8ab44e 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -82,7 +82,6 @@ import org.slf4j.LoggerFactory;
 
 import javax.jcr.PropertyType;
 import java.io.IOException;
-import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -131,6 +130,11 @@ public class ElasticRequestHandler {
 
     private static final String HIGHLIGHT_PREFIX = "<strong>";
     private static final String HIGHLIGHT_SUFFIX = "</strong>";
+    // by default, highlight analyzes up to 1M characters. If the content is 
larger than that, an error is thrown.
+    // To avoid that we need to set a limit lower than that.
+    // TODO: when upgrading to 9.x this value can be set to -1 to implicitly 
set the limit to index.highlight.max_analyzed_offset
+    // https://github.com/elastic/elasticsearch/pull/118895
+    private static final int HIGHLIGHT_MAX_ANALYZED_OFFSET = 999_999;
 
     // Match Lucene 4.x fuzzy queries (e.g., roam~0.8), but not 5.x and beyond 
(e.g., roam~2)
     private static final Pattern LUCENE_4_FUZZY_PATTERN = 
Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
@@ -941,13 +945,18 @@ public class ElasticRequestHandler {
      * @return a Highlight object representing the excerpts to request or null 
if none should be requested
      */
     public Highlight highlight() {
+        // if the query does not have a full text constraint, the excerpt 
makes no sense (it will always be empty)
+        if (indexPlan.getFilter().getFullTextConstraint() == null) {
+            return null;
+        }
+
         Map<String, HighlightField> excerpts = 
indexPlan.getFilter().getPropertyRestrictions().stream()
                 .filter(pr -> 
pr.propertyName.startsWith(QueryConstants.REP_EXCERPT))
                 .map(this::excerptField)
                 .distinct()
                 .collect(Collectors.toMap(
                         Function.identity(),
-                        field -> HighlightField.of(hf -> hf.withJson(new 
StringReader("{}"))))
+                        field -> HighlightField.of(hf -> hf))
                 );
 
         if (excerpts.isEmpty()) {
@@ -958,6 +967,7 @@ public class ElasticRequestHandler {
                 .preTags(HIGHLIGHT_PREFIX)
                 .postTags(HIGHLIGHT_SUFFIX)
                 .fields(excerpts)
+                .maxAnalyzedOffset(HIGHLIGHT_MAX_ANALYZED_OFFSET)
                 .numberOfFragments(1)
                 .requireFieldMatch(false));
     }
diff --git 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
index a265a70c66..71e713251b 100644
--- 
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
+++ 
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java
@@ -18,6 +18,7 @@
  */
 package org.apache.jackrabbit.oak.plugins.index;
 
+import org.apache.commons.lang3.RandomStringUtils;
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.PropertyValue;
@@ -29,13 +30,12 @@ import 
org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
 import org.apache.jackrabbit.oak.plugins.index.search.IndexFormatVersion;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.apache.jackrabbit.oak.query.AbstractQueryTest;
+import org.apache.jackrabbit.oak.spi.filter.PathFilter;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
 
 import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
@@ -73,6 +73,7 @@ public abstract class ExcerptTest extends AbstractQueryTest {
         def.setProperty(REINDEX_PROPERTY_NAME, true);
         def.setProperty(FulltextIndexConstants.EVALUATE_PATH_RESTRICTION, 
true);
         def.setProperty(FulltextIndexConstants.COMPAT_MODE, 
IndexFormatVersion.V2.getVersion());
+        def.setProperty(PathFilter.PROP_INCLUDED_PATHS, List.of("/testRoot"), 
Type.STRINGS);
 
         Tree properties = def.addChild(FulltextIndexConstants.INDEX_RULES)
                 .addChild("nt:base")
@@ -104,7 +105,7 @@ public abstract class ExcerptTest extends AbstractQueryTest 
{
         contentRoot.setProperty("baz", "fox ifoxing");
         root.commit();
 
-        List<String> columns = new ArrayList<>(Arrays.asList("rep:excerpt", 
"rep:excerpt(.)", "rep:excerpt(foo)", "rep:excerpt(bar)"));
+        List<String> columns = List.of("rep:excerpt", "rep:excerpt(.)", 
"rep:excerpt(foo)", "rep:excerpt(bar)");
         String selectColumns = columns.stream().map(col -> "[" + col + 
"]").collect(Collectors.joining(","));
         String query = "SELECT " + selectColumns + " FROM [nt:base] WHERE 
CONTAINS(*, 'fox')";
         assertEventually(() -> {
@@ -320,4 +321,27 @@ public abstract class ExcerptTest extends 
AbstractQueryTest {
             }
         });
     }
+
+    @Test
+    public void excerptOnLargeField() throws Exception {
+        Tree contentRoot = root.getTree("/").addChild("testRoot");
+        StringBuilder largeContent = new StringBuilder("fox ");
+        for (int i = 0; i < 1_000_000; i++) {
+            
largeContent.append(RandomStringUtils.insecure().nextAlphabetic(5)).append(" ");
+        }
+        largeContent.append(" foxing");
+        contentRoot.addChild("relative").setProperty("baz", 
largeContent.toString());
+        root.commit();
+
+        String query = "SELECT [rep:excerpt(.)] FROM [nt:base] WHERE 
CONTAINS(*, 'fox')";
+        assertEventually(() -> {
+            try {
+                Result result = executeQuery(query, SQL2, NO_BINDINGS);
+                Iterator<? extends ResultRow> resultIter = 
result.getRows().iterator();
+                assertTrue(resultIter.hasNext());
+            } catch (ParseException e) {
+                fail(e.getMessage());
+            }
+        });
+    }
 }

Reply via email to