This is an automated email from the ASF dual-hosted git repository. mkataria pushed a commit to branch OAK-11757 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit f9ea15e3037c207318b999e1e3b3d5d7809c2c59 Author: Mohit Kataria <tiho...@gmail.com> AuthorDate: Thu Jun 12 10:32:26 2025 +0530 OAK-11757: Implement similarityThreshold for inference --- .../index/elastic/query/ElasticRequestHandler.java | 1 + .../inference/ElasticInferenceUsingConfigTest.java | 63 +++++++++++++++++++++- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java index 2411079170..1c632c46d5 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java @@ -694,6 +694,7 @@ public class ElasticRequestHandler { knnQueryBuilder.field(InferenceConstants.VECTOR_SPACES + "." + inferenceModelConfigName + "." + InferenceConstants.VECTOR); knnQueryBuilder.numCandidates(inferenceModelConfig.getNumCandidates()); knnQueryBuilder.queryVector(embeddings); + knnQueryBuilder.similarity((float) inferenceModelConfig.getSimilarityThreshold()); // filters in knn are only applicable if filters are defined in knn query itself. // the filters outside knn query are applicable as post filters which can lead to missing results. 
if (planResult.evaluateNonFullTextConstraints()) { diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java index 4904ca137c..99cacec41c 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java @@ -883,7 +883,7 @@ public class ElasticInferenceUsingConfigTest extends ElasticAbstractQueryTest { // Create inference configuration createInferenceConfig(jcrIndexName, true, defaultEnricherConfig, inferenceModelConfigName, - inferenceModelName, inferenceServiceUrl, 0.8, 1L, true, true); + inferenceModelName, inferenceServiceUrl, 0.7, 1L, true, true); setupEnricherStatus(defaultEnricherStatusMapping, defaultEnricherStatusData); // Create index definition with searchable properties @@ -989,4 +989,65 @@ public class ElasticInferenceUsingConfigTest extends ElasticAbstractQueryTest { assertEquals("/content/filterPath/ml", results.get(0)); }); } + + @Test + public void testSimilarityThresholdInKnnQuery() throws Exception { + String inferenceConfigInQuery = "?{}?"; + String jcrIndexName = UUID.randomUUID().toString(); + String inferenceServiceUrl = "http://localhost:" + wireMock.port() + "/v1/embeddings"; + String inferenceModelConfigName = "ada-test-model"; + String inferenceModelName = "text-embedding-ada-002"; + + // Create inference config + Double initialSimilarityThreshold = 0.2; + createInferenceConfig(jcrIndexName, true, defaultEnricherConfig, inferenceModelConfigName, + inferenceModelName, inferenceServiceUrl, initialSimilarityThreshold, 1L, true, true); + setupEnricherStatus(defaultEnricherStatusMapping, 
defaultEnricherStatusData); + // Create index definition with multiple properties + IndexDefinitionBuilder builder = createIndexDefinition("title", "description", "updatedBy"); + Tree index = setIndex(jcrIndexName, builder); + root.commit(); + + // Add test content + addTestContent(); + + // Let the index catch up + assertEventually(() -> assertEquals(7, countDocuments(index))); + + // Enrich documents with embeddings + setupEmbeddingsForContent(index, inferenceModelConfigName, inferenceModelName); + + // Setup wiremock stubs for inference service + setupMockInferenceService(inferenceModelConfigName, jcrIndexName); + + String searchQuery = "technological advancements in electric vehicles"; + String queryPath = "select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content') and contains(*, '" + + inferenceConfigInQuery + searchQuery + "')"; + LOG.info("Running initial query with similarity threshold {}: {}", initialSimilarityThreshold, queryPath); + assertEventually(() -> { + List<String> results = executeQuery(queryPath, SQL2, true, true); + LOG.info("Query with similarity threshold {} returned {} results: {}", + initialSimilarityThreshold, results.size(), results); + assertEquals(5, results.size()); + }); + + // update similarity threshold + double newThreshold = 0.8; + LOG.info("Updating similarity threshold from {} to {}", initialSimilarityThreshold, newThreshold); + // using same parameters as above apart from similarityThreshold, + // effectively updating similarityThreshold value. 
+ createInferenceConfig(jcrIndexName, true, defaultEnricherConfig, inferenceModelConfigName, + inferenceModelName, inferenceServiceUrl, newThreshold, 1L, true, true); + InferenceConfig.reInitialize(); + + // With higher threshold number of documents should decrease + LOG.info("Running query with updated similarity threshold {}: {}", newThreshold, queryPath); + assertEventually(() -> { + List<String> results = executeQuery(queryPath, SQL2, true, true); + LOG.info("Query with similarity threshold {} returned {} results: {}", + newThreshold, results.size(), results); + assertEquals(1, results.size()); + }); + + } }