(jackrabbit-oak) 01/01: OAK-11743: Filters only work inside knn query, so filters should be moved to knn query also

mkataria Thu, 29 May 2025 00:06:00 -0700

This is an automated email from the ASF dual-hosted git repository.

mkataria pushed a commit to branch OAK-11743
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


commit 99bb2cbdfd519191cf22329f2644a682c1e9ee04
Author: Mohit Kataria <[email protected]>
AuthorDate: Thu May 29 12:16:32 2025 +0530

    OAK-11743: Filters only work inside knn query, so filters should be moved 
to knn query also
---
 .../index/elastic/query/ElasticRequestHandler.java |   8 +-
 .../inference/ElasticInferenceUsingConfigTest.java | 185 ++++++++++++++++++++-
 2 files changed, 188 insertions(+), 5 deletions(-)

diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 220935327e..2411079170 100644
--- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++ 
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -694,7 +694,13 @@ public class ElasticRequestHandler {
                         knnQueryBuilder.field(InferenceConstants.VECTOR_SPACES 
+ "." + inferenceModelConfigName + "." + InferenceConstants.VECTOR);
                         
knnQueryBuilder.numCandidates(inferenceModelConfig.getNumCandidates());
                         knnQueryBuilder.queryVector(embeddings);
-
+                        // Filters take part in the knn search only if they are 
defined in the knn query itself.
+                        // Filters outside the knn query are applied as 
post filters, which can lead to missing results.
+                        if (planResult.evaluateNonFullTextConstraints()) {
+                            for (Query constraint : 
nonFullTextConstraints(indexPlan, planResult)) {
+                                knnQueryBuilder.filter(constraint);
+                            }
+                        }
                         KnnQuery knnQuery = knnQueryBuilder.build();
 
                         NestedQuery.Builder nestedQueryBuilder = new 
NestedQuery.Builder()
diff --git 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java
 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java
index 16ca5b8f74..bc81b6c1c9 100644
--- 
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java
+++ 
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java
@@ -17,6 +17,9 @@
 package org.apache.jackrabbit.oak.plugins.index.elastic.query.inference;
 
 import ch.qos.logback.classic.Level;
+import co.elastic.clients.elasticsearch._types.ElasticsearchException;
+import co.elastic.clients.elasticsearch.core.SearchRequest;
+import co.elastic.clients.elasticsearch.core.SearchResponse;
 import co.elastic.clients.elasticsearch.indices.get_mapping.IndexMappingRecord;
 import co.elastic.clients.json.JsonData;
 import com.fasterxml.jackson.core.JsonProcessingException;
@@ -36,6 +39,7 @@ import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
 import org.apache.jackrabbit.oak.commons.junit.LogCustomizer;
 import 
org.apache.jackrabbit.oak.plugins.index.elastic.ElasticAbstractQueryTest;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
 import 
org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
 import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
 import org.apache.jackrabbit.oak.spi.commit.EmptyHook;
@@ -45,7 +49,6 @@ import org.apache.jackrabbit.oak.stats.CounterStats;
 import org.apache.jackrabbit.oak.stats.DefaultStatisticsProvider;
 import org.apache.jackrabbit.oak.stats.StatisticsProvider;
 import org.apache.jackrabbit.oak.stats.StatsOptions;
-import org.jetbrains.annotations.NotNull;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Ignore;
@@ -355,9 +358,20 @@ public class ElasticInferenceUsingConfigTest extends 
ElasticAbstractQueryTest {
 
     /**
      * Adds test content for the hybrid search test.
+     * By default, nodes are created under /content.
      */
     private void addTestContent() throws CommitFailedException {
-        Tree content = root.getTree("/").addChild("content");
+        addTestContent("/content");
+    }
+
+    /**
+     * Adds test content on a specified path.
+     */
+    private void addTestContent(String path) throws CommitFailedException {
+        Tree content = root.getTree("/");
+        for (String element : PathUtils.elements(path)) {
+            content = content.addChild(element);
+        }
 
         // Health content
         Tree health = content.addChild("health");
@@ -400,7 +414,8 @@ public class ElasticInferenceUsingConfigTest extends 
ElasticAbstractQueryTest {
         List<String> paths = executeQuery("select [jcr:path] from [nt:base] 
where ISDESCENDANTNODE('/content') and title is not null", SQL2);
 
         for (String path : paths) {
-            URL json = this.getClass().getResource("/inferenceUsingConfig" + 
path + ".json");
+            String docName = PathUtils.getName(path);
+            URL json = 
this.getClass().getResource("/inferenceUsingConfig/content/" + docName + 
".json");
             if (json != null) {
                 Map<String, Collection<Double>> map = mapper.readValue(json, 
Map.class);
                 ObjectNode updateDoc = mapper.createObjectNode();
@@ -492,7 +507,7 @@ public class ElasticInferenceUsingConfigTest extends 
ElasticAbstractQueryTest {
     private void verifyErrorHandling(String jcrIndexName, String 
inferenceConfigInQuery) {
         // Test server error handling
         String queryPath3 = "select [jcr:path] from [nt:base] where 
ISDESCENDANTNODE('/content') and contains(*, '"
-            + inferenceConfigInQuery  + "machine learning')";
+            + inferenceConfigInQuery + "machine learning')";
         assertQuery(queryPath3, List.of("/content/ml", 
"/content/programming"));
 
         // Test timeout handling
@@ -801,4 +816,166 @@ public class ElasticInferenceUsingConfigTest extends 
ElasticAbstractQueryTest {
             return uniquePrefix + "_" + baseName;
         }
     }
+
+    /**
+     * Count documents under a specific path
+     */
+    private int countDocuments(Tree index, String path) {
+        try {
+            return (int) getIndexedPaths(index).stream()
+                .filter(indexPath -> indexPath.startsWith(path))
+                .count();
+        } catch (IOException e) {
+            LOG.error("Error counting documents", e);
+            return 0;
+        }
+    }
+
+    /**
+     * Get all paths indexed in the given index
+     */
+    private List<String> getIndexedPaths(Tree index) throws IOException {
+        ElasticIndexDefinition esIdxDef = getElasticIndexDefinition(index);
+
+        try {
+            // Query to get all documents
+            SearchRequest searchRequest = SearchRequest.of(r -> r
+                .index(esIdxDef.getIndexAlias())
+                .size(10000)  // Set a high limit, adjust if needed
+                .query(q -> q.matchAll(m -> m))
+            );
+
+            SearchResponse<JsonData> response = 
esConnection.getClient().search(searchRequest, JsonData.class);
+
+            // Extract paths from all hits
+            return response.hits().hits().stream()
+                .map(hit -> hit.id())
+                .collect(Collectors.toList());
+        } catch (ElasticsearchException e) {
+            throw new IOException("Error getting indexed paths", e);
+        }
+    }
+
+    /**
+     * Tests KNN search functionality with a large number of documents.
+     * This test verifies that vector search works correctly with filters when 
the
+     * number of documents exceeds the default KNN result limit.
+     */
+    @Test
+    public void testHugeIngestionForKNNFilters() throws Exception {
+        // Setup test parameters
+        String jcrIndexName = UUID.randomUUID().toString();
+        String inferenceServiceUrl = "http://localhost:" + wireMock.port() + 
"/v1/embeddings";
+        String inferenceModelConfigName = "ada-test-model";
+        String inferenceModelName = "text-embedding-ada-002";
+        String inferenceConfigInQuery = "?{}?";
+
+        // Create inference configuration
+        createInferenceConfig(jcrIndexName, true, defaultEnricherConfig, 
inferenceModelConfigName,
+            inferenceModelName, inferenceServiceUrl, 0.8, 1L, true, true);
+        setupEnricherStatus(defaultEnricherStatusMapping, 
defaultEnricherStatusData);
+
+        // Create index definition with searchable properties
+        IndexDefinitionBuilder builder = createIndexDefinition("title", 
"description", "updatedBy");
+        Tree index = setIndex(jcrIndexName, builder);
+        root.commit();
+
+        // Setup mock inference service
+        setupMockInferenceService(inferenceModelConfigName, jcrIndexName);
+
+        // Create and index test content
+        LOG.info("Starting large-scale document ingestion test for KNN 
search");
+
+        // Add regular test documents to search against; these will be 
used when searching with filters
+        addTestContent("/content/filterPath");
+
+        // Remove the cars node as we'll be searching for content similar to 
"cars" query
+        // but need to verify we get the next best result (ML content)
+        
root.getTree("/").getChild("content").getChild("filterPath").getChild("cars").remove();
+        root.commit();
+
+        // Let the index catch up with initial content
+        assertEventually(() -> assertEquals(7, countDocuments(index)));
+
+        // Enrich the initial test documents with embeddings
+        setupEmbeddingsForContent(index, inferenceModelConfigName, 
inferenceModelName);
+
+        // Create a large number of documents with car-related content
+        Tree hugeIngestion = 
root.getTree("/").addChild("content").addChild("hugeIngestion");
+        root.commit();
+
+        // Default KNN result limit is 100, so we create more documents to 
test post-filtering
+        int numberOfDocuments = 200;
+
+        // Load the embeddings from cars.json to reuse for all test documents
+        URL jsonUrl = 
this.getClass().getResource("/inferenceUsingConfig/content/cars.json");
+        ObjectMapper mapper = new JsonMapper();
+        Map<String, Collection<Double>> embeddingsMap = 
mapper.readValue(jsonUrl, Map.class);
+        List<Float> embeddings = embeddingsMap.get("embedding").stream()
+            .map(d -> ((Double) d).floatValue())
+            .collect(Collectors.toList());
+
+        // Create a large number of car-related documents
+        LOG.info("Creating {} car-related documents", numberOfDocuments);
+        for (int i = 0; i < numberOfDocuments; i++) {
+            String nodeName = "cars_" + i;
+            Tree document = hugeIngestion.addChild(nodeName);
+            document.setProperty("title", "The Future of Electric Cars " + i);
+            document.setProperty("description",
+                "Electric vehicles are revolutionizing the automobile 
industry. Document " + i +
+                    " explores advancements in battery technology, charging 
infrastructure, and sustainability.");
+        }
+        root.commit();
+
+        // Wait for initial indexing to complete
+        LOG.info("Waiting for initial indexing to complete");
+        assertEventually(() -> {
+            int docCount = countDocuments(index, "/content/hugeIngestion");
+            LOG.info("Current document count: {}", docCount);
+            assertTrue("Expected at least " + numberOfDocuments + " documents, 
found " + docCount,
+                docCount >= numberOfDocuments);
+        });
+
+        // Add vector embeddings to all documents
+        LOG.info("Adding vector embeddings to all documents");
+        for (int i = 0; i < numberOfDocuments; i++) {
+            String path = "/content/hugeIngestion/cars_" + i;
+            VectorDocument vectorDocument = new VectorDocument(
+                UUID.randomUUID().toString(),
+                embeddings,
+                Map.of("updatedAt", Instant.now().toEpochMilli(), "model", 
inferenceModelName)
+            );
+
+            ObjectNode updateDoc = mapper.createObjectNode();
+            ObjectNode vectorSpacesNode = 
updateDoc.putObject(InferenceConstants.VECTOR_SPACES);
+            ArrayNode inferenceModelConfigNode = 
vectorSpacesNode.putArray(inferenceModelConfigName);
+            inferenceModelConfigNode.addPOJO(vectorDocument);
+
+            updateDocument(index, path, updateDoc);
+
+            // Log progress periodically
+            if (i % 50 == 0) {
+                LOG.info("Added embeddings to {} documents", i);
+            }
+        }
+
+        LOG.info("All documents have been ingested with embeddings");
+
+        // Test vector search query that should match car-related content
+        // but only return results from the filterPath (not from hugeIngestion)
+        String searchQuery = "technological advancements in electric vehicles";
+        String queryPath = "select [jcr:path] from [nt:base] where 
ISDESCENDANTNODE('/content/filterPath') and contains(*, '"
+            + inferenceConfigInQuery + searchQuery + "')";
+
+        // Execute query and verify results
+        LOG.info("Executing vector search query with path filter: {}", 
queryPath);
+        assertEventually(() -> {
+            List<String> results = executeQuery(queryPath, SQL2, true, true);
+
+            // Since we removed the cars node, we should still get ML content 
as the next best match
+            assertFalse("Should have returned at least one result", 
results.isEmpty());
+            LOG.info("Search returned {} results with machine learning 
content", results.size());
+            assertEquals("/content/filterPath/ml", results.get(0));
+        });
+    }
 }

(jackrabbit-oak) 01/01: OAK-11743: Filters only work inside knn query, so filters should be moved to knn query also

Reply via email to