This is an automated email from the ASF dual-hosted git repository. abenedetti pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push: new 35a9b975eca SOLR-17632: Text to Vector Update Request Processor (#3151) 35a9b975eca is described below commit 35a9b975ecaa83b2d4bd571d05d427f305af0f16 Author: Alessandro Benedetti <a.benede...@sease.io> AuthorDate: Thu Mar 13 16:20:45 2025 +0000 SOLR-17632: Text to Vector Update Request Processor (#3151) * SOLR-17632: introducing update request processor + tests --------- Co-authored-by: Christine Poerschke <cpoersc...@apache.org> --- solr/CHANGES.txt | 2 + .../model/SolrTextToVectorModel.java | 6 +- .../model/package-info.java | 2 +- .../search/TextToVectorQParserPlugin.java | 6 +- .../search/package-info.java | 2 +- .../store/TextToVectorModelException.java | 2 +- .../store/TextToVectorModelStore.java | 4 +- .../store/package-info.java | 2 +- .../store/rest/ManagedTextToVectorModelStore.java | 8 +- .../store/rest/package-info.java | 2 +- .../processor/TextToVectorUpdateProcessor.java | 94 ++++++++++++ .../TextToVectorUpdateProcessorFactory.java | 121 +++++++++++++++ .../update/processor}/package-info.java | 4 +- .../modelExamples/dummy-model-ambiguous.json | 2 +- .../modelExamples/dummy-model-unsupported.json | 2 +- .../src/test-files/modelExamples/dummy-model.json | 2 +- .../modelExamples/exception-throwing-model.json | 6 + .../solr/collection1/conf/solrconfig-llm.xml | 22 ++- .../src/test/org/apache/solr/llm/TestLlmBase.java | 2 +- .../model/DummyEmbeddingModel.java | 2 +- .../model/DummyEmbeddingModelTest.java | 2 +- .../model/ExceptionThrowingEmbeddingModel.java | 54 +++++++ .../search/TextToVectorQParserTest.java | 8 +- .../store/rest/TestModelManager.java | 10 +- .../store/rest/TestModelManagerPersistence.java | 2 +- .../TextToVectorUpdateProcessorFactoryTest.java | 152 +++++++++++++++++++ .../processor/TextToVectorUpdateProcessorTest.java | 168 +++++++++++++++++++++ .../pages/update-request-processors.adoc | 6 + .../modules/query-guide/pages/text-to-vector.adoc | 150 +++++++++++++++++- 
.../java/org/apache/solr/util/RestTestBase.java | 22 ++- 30 files changed, 833 insertions(+), 34 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 725fe56e615..f17699d1191 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -167,6 +167,8 @@ New Features * SOLR-17309: Certificate based authentication plugin now has richer flexible cert principal resolution. (Lamine Idjeraoui via Eric Pugh) +* SOLR-17632: Added update request processor to encode text to vector at indexing time through external LLM services. (Alessandro Benedetti) + Improvements --------------------- * SOLR-15751: The v2 API now has parity with the v1 "COLSTATUS" and "segments" APIs, which can be used to fetch detailed information about diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/model/SolrTextToVectorModel.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/model/SolrTextToVectorModel.java similarity index 97% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/model/SolrTextToVectorModel.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/model/SolrTextToVectorModel.java index f798f27db5d..e8c5bcf014a 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/model/SolrTextToVectorModel.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/model/SolrTextToVectorModel.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.solr.llm.texttovector.model; +package org.apache.solr.llm.textvectorisation.model; import dev.langchain4j.data.embedding.Embedding; import dev.langchain4j.model.embedding.EmbeddingModel; @@ -26,8 +26,8 @@ import java.util.Objects; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.RamUsageEstimator; import org.apache.solr.core.SolrResourceLoader; -import org.apache.solr.llm.texttovector.store.TextToVectorModelException; -import org.apache.solr.llm.texttovector.store.rest.ManagedTextToVectorModelStore; +import org.apache.solr.llm.textvectorisation.store.TextToVectorModelException; +import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore; /** * This object wraps a {@link dev.langchain4j.model.embedding.EmbeddingModel} to encode text to diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/model/package-info.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/model/package-info.java similarity index 94% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/model/package-info.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/model/package-info.java index 64e50f6f88b..3c4fb4d73d6 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/model/package-info.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/model/package-info.java @@ -16,4 +16,4 @@ */ /** APIs and classes for implementing text to vector logic. 
*/ -package org.apache.solr.llm.texttovector.model; +package org.apache.solr.llm.textvectorisation.model; diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/TextToVectorQParserPlugin.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/search/TextToVectorQParserPlugin.java similarity index 96% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/TextToVectorQParserPlugin.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/search/TextToVectorQParserPlugin.java index 17749eba7b6..449b4b6dd76 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/TextToVectorQParserPlugin.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/search/TextToVectorQParserPlugin.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.solr.llm.texttovector.search; +package org.apache.solr.llm.textvectorisation.search; import java.io.IOException; import org.apache.lucene.index.VectorEncoding; @@ -26,8 +26,8 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrResourceLoader; -import org.apache.solr.llm.texttovector.model.SolrTextToVectorModel; -import org.apache.solr.llm.texttovector.store.rest.ManagedTextToVectorModelStore; +import org.apache.solr.llm.textvectorisation.model.SolrTextToVectorModel; +import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.rest.ManagedResource; import org.apache.solr.rest.ManagedResourceObserver; diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/package-info.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/search/package-info.java similarity index 94% copy 
from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/package-info.java copy to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/search/package-info.java index 9fbf84e62c6..f188323c2c3 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/package-info.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/search/package-info.java @@ -16,4 +16,4 @@ */ /** APIs and classes for implementing text to vector QueryParsers. */ -package org.apache.solr.llm.texttovector.search; +package org.apache.solr.llm.textvectorisation.search; diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/TextToVectorModelException.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/TextToVectorModelException.java similarity index 95% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/TextToVectorModelException.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/TextToVectorModelException.java index 076f2b45f9b..d64b124ec5c 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/TextToVectorModelException.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/TextToVectorModelException.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.solr.llm.texttovector.store; +package org.apache.solr.llm.textvectorisation.store; public class TextToVectorModelException extends RuntimeException { diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/TextToVectorModelStore.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/TextToVectorModelStore.java similarity index 94% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/TextToVectorModelStore.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/TextToVectorModelStore.java index cf1db239d44..e971e050957 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/TextToVectorModelStore.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/TextToVectorModelStore.java @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.solr.llm.texttovector.store; +package org.apache.solr.llm.textvectorisation.store; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import org.apache.solr.llm.texttovector.model.SolrTextToVectorModel; +import org.apache.solr.llm.textvectorisation.model.SolrTextToVectorModel; /** Simple store to manage CRUD operations on the {@link SolrTextToVectorModel} */ public class TextToVectorModelStore { diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/package-info.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/package-info.java similarity index 94% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/package-info.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/package-info.java index 630ac6085a8..36303a4c076 100644 --- 
a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/package-info.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/package-info.java @@ -16,4 +16,4 @@ */ /** Contains model store related classes. */ -package org.apache.solr.llm.texttovector.store; +package org.apache.solr.llm.textvectorisation.store; diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/rest/ManagedTextToVectorModelStore.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/rest/ManagedTextToVectorModelStore.java similarity index 96% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/rest/ManagedTextToVectorModelStore.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/rest/ManagedTextToVectorModelStore.java index 0652ec54d77..ffab7207075 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/rest/ManagedTextToVectorModelStore.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/rest/ManagedTextToVectorModelStore.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.solr.llm.texttovector.store.rest; +package org.apache.solr.llm.textvectorisation.store.rest; import java.lang.invoke.MethodHandles; import java.util.LinkedHashMap; @@ -26,9 +26,9 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrResourceLoader; -import org.apache.solr.llm.texttovector.model.SolrTextToVectorModel; -import org.apache.solr.llm.texttovector.store.TextToVectorModelException; -import org.apache.solr.llm.texttovector.store.TextToVectorModelStore; +import org.apache.solr.llm.textvectorisation.model.SolrTextToVectorModel; +import org.apache.solr.llm.textvectorisation.store.TextToVectorModelException; +import org.apache.solr.llm.textvectorisation.store.TextToVectorModelStore; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.rest.BaseSolrResource; import org.apache.solr.rest.ManagedResource; diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/rest/package-info.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/rest/package-info.java similarity index 93% rename from solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/rest/package-info.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/rest/package-info.java index 56ae30f2ebe..118bc245e30 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/store/rest/package-info.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/store/rest/package-info.java @@ -16,4 +16,4 @@ */ /** Contains the {@link org.apache.solr.rest.ManagedResource} that encapsulate the model stores. 
*/ -package org.apache.solr.llm.texttovector.store.rest; +package org.apache.solr.llm.textvectorisation.store.rest; diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessor.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessor.java new file mode 100644 index 00000000000..cac6d4283aa --- /dev/null +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessor.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.llm.textvectorisation.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.List; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.SolrInputField; +import org.apache.solr.llm.textvectorisation.model.SolrTextToVectorModel; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class TextToVectorUpdateProcessor extends UpdateRequestProcessor { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private IndexSchema schema; + private final String inputField; + private final String outputField; + private SolrTextToVectorModel textToVector; + + public TextToVectorUpdateProcessor( + String inputField, + String outputField, + SolrTextToVectorModel textToVector, + SolrQueryRequest req, + UpdateRequestProcessor next) { + super(next); + this.schema = req.getSchema(); + this.inputField = inputField; + this.outputField = outputField; + this.textToVector = textToVector; + } + + /** + * @param cmd the update command in input containing the Document to process + * @throws IOException If there is a low-level I/O error + */ + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument doc = cmd.getSolrInputDocument(); + SolrInputField inputFieldContent = doc.get(inputField); + if (!isNullOrEmpty(inputFieldContent)) { + try { + String textToVectorise = inputFieldContent.getValue().toString(); + float[] vector = textToVector.vectorise(textToVectorise); + List<Float> vectorAsList = new ArrayList<Float>(vector.length); + for (float f : vector) { + vectorAsList.add(f); + } + doc.addField(outputField, 
vectorAsList); + } catch (RuntimeException vectorisationFailure) { + if (log.isErrorEnabled()) { + SchemaField uniqueKeyField = schema.getUniqueKeyField(); + String uniqueKeyFieldName = uniqueKeyField.getName(); + log.error( + "Could not vectorise: {} for the document with {}: {}", + inputField, + uniqueKeyFieldName, + doc.getFieldValue(uniqueKeyFieldName), + vectorisationFailure); + } + } + } + super.processAdd(cmd); + } + + protected boolean isNullOrEmpty(SolrInputField inputFieldContent) { + return (inputFieldContent == null + || inputFieldContent.getValue() == null + || inputFieldContent.getValue().toString().isEmpty()); + } +} diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactory.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactory.java new file mode 100644 index 00000000000..559ea9a4309 --- /dev/null +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactory.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.llm.textvectorisation.update.processor; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.RequiredSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.llm.textvectorisation.model.SolrTextToVectorModel; +import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.DenseVectorField; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.apache.solr.update.processor.UpdateRequestProcessorFactory; + +/** + * Vectorises a textual field value and add the resulting vector to another field. + * + * <p>The parameters supported are: + * + * <pre class="prettyprint" > + * <processor class="solr.llm.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory"> + * <str name="inputField">textualField</str> + * <str name="outputField">vectorField</str> + * <str name="model">textToVectorModel</str> + * </processor> + * </pre> + * + * * + */ +public class TextToVectorUpdateProcessorFactory extends UpdateRequestProcessorFactory { + private static final String INPUT_FIELD_PARAM = "inputField"; + private static final String OUTPUT_FIELD_PARAM = "outputField"; + private static final String MODEL_NAME = "model"; + + private String inputField; + private String outputField; + private String modelName; + private SolrParams params; + + @Override + public void init(final NamedList<?> args) { + params = args.toSolrParams(); + RequiredSolrParams required = params.required(); + inputField = required.get(INPUT_FIELD_PARAM); + outputField = required.get(OUTPUT_FIELD_PARAM); + modelName = required.get(MODEL_NAME); + } + + @Override + 
public UpdateRequestProcessor getInstance( + SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + IndexSchema latestSchema = req.getCore().getLatestSchema(); + if (!latestSchema.hasExplicitField(inputField)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "undefined field: \"" + inputField + "\""); + } + if (!latestSchema.hasExplicitField(outputField)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "undefined field: \"" + outputField + "\""); + } + + final SchemaField outputFieldSchema = latestSchema.getField(outputField); + assertIsDenseVectorField(outputFieldSchema); + + ManagedTextToVectorModelStore modelStore = + ManagedTextToVectorModelStore.getManagedModelStore(req.getCore()); + SolrTextToVectorModel textToVector = modelStore.getModel(modelName); + if (textToVector == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "The model configured in the Update Request Processor '" + + modelName + + "' can't be found in the store: " + + ManagedTextToVectorModelStore.REST_END_POINT); + } + + return new TextToVectorUpdateProcessor(inputField, outputField, textToVector, req, next); + } + + protected void assertIsDenseVectorField(SchemaField schemaField) { + FieldType fieldType = schemaField.getType(); + if (!(fieldType instanceof DenseVectorField)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "only DenseVectorField is compatible with Vector Query Parsers: " + + schemaField.getName()); + } + } + + public String getInputField() { + return inputField; + } + + public String getOutputField() { + return outputField; + } + + public String getModelName() { + return modelName; + } +} diff --git a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/package-info.java b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/package-info.java similarity index 86% rename from 
solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/package-info.java rename to solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/package-info.java index 9fbf84e62c6..2c5f2ef0072 100644 --- a/solr/modules/llm/src/java/org/apache/solr/llm/texttovector/search/package-info.java +++ b/solr/modules/llm/src/java/org/apache/solr/llm/textvectorisation/update/processor/package-info.java @@ -15,5 +15,5 @@ * limitations under the License. */ -/** APIs and classes for implementing text to vector QueryParsers. */ -package org.apache.solr.llm.texttovector.search; +/** Contains update request processor related classes. */ +package org.apache.solr.llm.textvectorisation.update.processor; diff --git a/solr/modules/llm/src/test-files/modelExamples/dummy-model-ambiguous.json b/solr/modules/llm/src/test-files/modelExamples/dummy-model-ambiguous.json index 43de925cf9d..417ce5b11a6 100644 --- a/solr/modules/llm/src/test-files/modelExamples/dummy-model-ambiguous.json +++ b/solr/modules/llm/src/test-files/modelExamples/dummy-model-ambiguous.json @@ -1,5 +1,5 @@ { - "class": "org.apache.solr.llm.texttovector.model.DummyEmbeddingModel", + "class": "org.apache.solr.llm.textvectorisation.model.DummyEmbeddingModel", "name": "dummy-1", "params": { "embedding": [1.0, 2.0, 3.0, 4.0], diff --git a/solr/modules/llm/src/test-files/modelExamples/dummy-model-unsupported.json b/solr/modules/llm/src/test-files/modelExamples/dummy-model-unsupported.json index 9af02f14003..7316cd7dd03 100644 --- a/solr/modules/llm/src/test-files/modelExamples/dummy-model-unsupported.json +++ b/solr/modules/llm/src/test-files/modelExamples/dummy-model-unsupported.json @@ -1,5 +1,5 @@ { - "class": "org.apache.solr.llm.texttovector.model.DummyEmbeddingModel", + "class": "org.apache.solr.llm.textvectorisation.model.DummyEmbeddingModel", "name": "dummy-1", "params": { "embedding": [1.0, 2.0, 3.0, 4.0], diff --git a/solr/modules/llm/src/test-files/modelExamples/dummy-model.json 
b/solr/modules/llm/src/test-files/modelExamples/dummy-model.json index 00603b8369b..750344f37c5 100644 --- a/solr/modules/llm/src/test-files/modelExamples/dummy-model.json +++ b/solr/modules/llm/src/test-files/modelExamples/dummy-model.json @@ -1,5 +1,5 @@ { - "class": "org.apache.solr.llm.texttovector.model.DummyEmbeddingModel", + "class": "org.apache.solr.llm.textvectorisation.model.DummyEmbeddingModel", "name": "dummy-1", "params": { "embedding": [1.0, 2.0, 3.0, 4.0] diff --git a/solr/modules/llm/src/test-files/modelExamples/exception-throwing-model.json b/solr/modules/llm/src/test-files/modelExamples/exception-throwing-model.json new file mode 100644 index 00000000000..c058da250d7 --- /dev/null +++ b/solr/modules/llm/src/test-files/modelExamples/exception-throwing-model.json @@ -0,0 +1,6 @@ +{ + "class": "org.apache.solr.llm.textvectorisation.model.ExceptionThrowingEmbeddingModel", + "name": "exception-throwing-model", + "params": { + } +} diff --git a/solr/modules/llm/src/test-files/solr/collection1/conf/solrconfig-llm.xml b/solr/modules/llm/src/test-files/solr/collection1/conf/solrconfig-llm.xml index 3a1d05285fc..005098bda8b 100644 --- a/solr/modules/llm/src/test-files/solr/collection1/conf/solrconfig-llm.xml +++ b/solr/modules/llm/src/test-files/solr/collection1/conf/solrconfig-llm.xml @@ -23,7 +23,7 @@ <!-- Query parser used to run neural queries--> <queryParser name="knn_text_to_vector" - class="org.apache.solr.llm.texttovector.search.TextToVectorQParserPlugin" /> + class="org.apache.solr.llm.textvectorisation.search.TextToVectorQParserPlugin" /> <query> <filterCache class="solr.CaffeineCache" size="4096" @@ -54,4 +54,24 @@ </lst> </requestHandler> + <updateRequestProcessorChain name="textToVector"> + <processor class="solr.llm.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory"> + <str name="inputField">_text_</str> + <str name="outputField">vector</str> + <str name="model">dummy-1</str> + </processor> + <processor 
class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> + + <updateRequestProcessorChain name="failingTextToVector"> + <processor class="solr.llm.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory"> + <str name="inputField">_text_</str> + <str name="outputField">vector</str> + <str name="model">exception-throwing-model</str> + </processor> + <processor class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> + + + </config> diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/TestLlmBase.java b/solr/modules/llm/src/test/org/apache/solr/llm/TestLlmBase.java index 6fb391ad364..6cd6ed2b5ae 100644 --- a/solr/modules/llm/src/test/org/apache/solr/llm/TestLlmBase.java +++ b/solr/modules/llm/src/test/org/apache/solr/llm/TestLlmBase.java @@ -26,7 +26,7 @@ import java.util.Arrays; import java.util.List; import org.apache.commons.io.file.PathUtils; import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.llm.texttovector.store.rest.ManagedTextToVectorModelStore; +import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore; import org.apache.solr.util.RestTestBase; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/model/DummyEmbeddingModel.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/DummyEmbeddingModel.java similarity index 98% rename from solr/modules/llm/src/test/org/apache/solr/llm/texttovector/model/DummyEmbeddingModel.java rename to solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/DummyEmbeddingModel.java index 00edcba114b..049d1cded7e 100644 --- a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/model/DummyEmbeddingModel.java +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/DummyEmbeddingModel.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under 
the License. */ -package org.apache.solr.llm.texttovector.model; +package org.apache.solr.llm.textvectorisation.model; import dev.langchain4j.data.embedding.Embedding; import dev.langchain4j.data.segment.TextSegment; diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/model/DummyEmbeddingModelTest.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/DummyEmbeddingModelTest.java similarity index 96% rename from solr/modules/llm/src/test/org/apache/solr/llm/texttovector/model/DummyEmbeddingModelTest.java rename to solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/DummyEmbeddingModelTest.java index 823f591fb95..10f5aae04a3 100644 --- a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/model/DummyEmbeddingModelTest.java +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/DummyEmbeddingModelTest.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.solr.llm.texttovector.model; +package org.apache.solr.llm.textvectorisation.model; import org.apache.solr.SolrTestCase; import org.junit.Test; diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/ExceptionThrowingEmbeddingModel.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/ExceptionThrowingEmbeddingModel.java new file mode 100644 index 00000000000..7c69ed8352d --- /dev/null +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/model/ExceptionThrowingEmbeddingModel.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.llm.textvectorisation.model; + +import dev.langchain4j.data.embedding.Embedding; +import dev.langchain4j.data.segment.TextSegment; +import dev.langchain4j.model.embedding.EmbeddingModel; +import dev.langchain4j.model.output.Response; +import java.util.List; + +public class ExceptionThrowingEmbeddingModel implements EmbeddingModel { + + @Override + public Response<Embedding> embed(String text) { + throw new RuntimeException("Failed to vectorise"); + } + + @Override + public Response<Embedding> embed(TextSegment textSegment) { + throw new RuntimeException("Failed to vectorise"); + } + + @Override + public Response<List<Embedding>> embedAll(List<TextSegment> textSegments) { + throw new RuntimeException("Failed to vectorise"); + } + + public static ExceptionThrowingEmbeddingModel.ExceptionThrowingEmbeddingModelBuilder builder() { + return new ExceptionThrowingEmbeddingModel.ExceptionThrowingEmbeddingModelBuilder(); + } + + public static class ExceptionThrowingEmbeddingModelBuilder { + + public ExceptionThrowingEmbeddingModelBuilder() {} + + public ExceptionThrowingEmbeddingModel build() { + return new ExceptionThrowingEmbeddingModel(); + } + } +} diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/search/TextToVectorQParserTest.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/search/TextToVectorQParserTest.java similarity 
index 98% rename from solr/modules/llm/src/test/org/apache/solr/llm/texttovector/search/TextToVectorQParserTest.java rename to solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/search/TextToVectorQParserTest.java index 5c406f08217..516d1b17e2f 100644 --- a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/search/TextToVectorQParserTest.java +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/search/TextToVectorQParserTest.java @@ -14,11 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.solr.llm.texttovector.search; +package org.apache.solr.llm.textvectorisation.search; import java.util.Arrays; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.llm.TestLlmBase; +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -29,6 +30,11 @@ public class TextToVectorQParserTest extends TestLlmBase { loadModel("dummy-model.json"); } + @AfterClass + public static void cleanup() throws Exception { + afterTest(); + } + @Test public void notExistentModel_shouldThrowException() throws Exception { final String solrQuery = "{!knn_text_to_vector model=not-exist f=vector topK=5}hello world"; diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/store/rest/TestModelManager.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/store/rest/TestModelManager.java similarity index 97% rename from solr/modules/llm/src/test/org/apache/solr/llm/texttovector/store/rest/TestModelManager.java rename to solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/store/rest/TestModelManager.java index 37d40b3f6c4..05e7f3bb0e9 100644 --- a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/store/rest/TestModelManager.java +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/store/rest/TestModelManager.java @@ -14,15 +14,16 @@ * See the License for the specific 
language governing permissions and * limitations under the License. */ -package org.apache.solr.llm.texttovector.store.rest; +package org.apache.solr.llm.textvectorisation.store.rest; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.llm.TestLlmBase; -import org.apache.solr.llm.texttovector.search.TextToVectorQParserPlugin; +import org.apache.solr.llm.textvectorisation.search.TextToVectorQParserPlugin; import org.apache.solr.rest.ManagedResource; import org.apache.solr.rest.ManagedResourceStorage; import org.apache.solr.rest.RestManager; +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -33,6 +34,11 @@ public class TestModelManager extends TestLlmBase { setupTest("solrconfig-llm.xml", "schema.xml", false, false); } + @AfterClass + public static void cleanup() throws Exception { + afterTest(); + } + @Test public void test() throws Exception { final SolrResourceLoader loader = new SolrResourceLoader(tmpSolrHome); diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/store/rest/TestModelManagerPersistence.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/store/rest/TestModelManagerPersistence.java similarity index 98% rename from solr/modules/llm/src/test/org/apache/solr/llm/texttovector/store/rest/TestModelManagerPersistence.java rename to solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/store/rest/TestModelManagerPersistence.java index 798e2f091b6..a7ed7923eeb 100644 --- a/solr/modules/llm/src/test/org/apache/solr/llm/texttovector/store/rest/TestModelManagerPersistence.java +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/store/rest/TestModelManagerPersistence.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.solr.llm.texttovector.store.rest; +package org.apache.solr.llm.textvectorisation.store.rest; import static java.nio.charset.StandardCharsets.UTF_8; diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java new file mode 100644 index 00000000000..f01aa187537 --- /dev/null +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.solr.llm.textvectorisation.update.processor;
+
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.llm.TestLlmBase;
+import org.apache.solr.request.SolrQueryRequestBase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests the argument validation of {@link TextToVectorUpdateProcessorFactory}: the checks done in
+ * {@code init} (required parameters) and the ones done in {@code getInstance} (fields resolved
+ * against the schema of the 'collection1' core).
+ */
+public class TextToVectorUpdateProcessorFactoryTest extends TestLlmBase {
+
+  @BeforeClass
+  public static void init() throws Exception {
+    setupTest("solrconfig-llm.xml", "schema.xml", false, false);
+  }
+
+  @AfterClass
+  public static void cleanup() throws Exception {
+    afterTest();
+  }
+
+  @Test
+  public void init_fullArgs_shouldInitAllParams() {
+    TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();
+    factoryToTest.init(factoryArgs("_text_", "vector", "model1"));
+
+    assertEquals("_text_", factoryToTest.getInputField());
+    assertEquals("vector", factoryToTest.getOutputField());
+    assertEquals("model1", factoryToTest.getModelName());
+  }
+
+  @Test
+  public void init_nullInputField_shouldThrowExceptionWithDetailedMessage() {
+    assertInitFails(
+        factoryArgs(null, "vector", "model1"), "Missing required parameter: inputField");
+  }
+
+  @Test
+  public void init_nullOutputField_shouldThrowExceptionWithDetailedMessage() {
+    assertInitFails(
+        factoryArgs("_text_", null, "model1"), "Missing required parameter: outputField");
+  }
+
+  @Test
+  public void init_nullModel_shouldThrowExceptionWithDetailedMessage() {
+    assertInitFails(factoryArgs("_text_", "vector", null), "Missing required parameter: model");
+  }
+
+  /* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
+  @Test
+  public void init_notExistentOutputField_shouldThrowExceptionWithDetailedMessage() {
+    assertGetInstanceFails(
+        factoryArgs("_text_", "notExistentOutput", "model1"),
+        "undefined field: \"notExistentOutput\"");
+  }
+
+  /* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
+  @Test
+  public void init_notDenseVectorOutputField_shouldThrowExceptionWithDetailedMessage() {
+    assertGetInstanceFails(
+        factoryArgs("_text_", "_text_", "model1"),
+        "only DenseVectorField is compatible with Vector Query Parsers: _text_");
+  }
+
+  /* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
+  @Test
+  public void init_notExistentInputField_shouldThrowExceptionWithDetailedMessage() {
+    assertGetInstanceFails(
+        factoryArgs("notExistentInput", "vector", "model1"),
+        "undefined field: \"notExistentInput\"");
+  }
+
+  /**
+   * Builds the factory init arguments in the order inputField, outputField, model; a {@code null}
+   * value means the corresponding parameter is left out entirely.
+   */
+  private NamedList<String> factoryArgs(String inputField, String outputField, String model) {
+    NamedList<String> args = new NamedList<>();
+    if (inputField != null) {
+      args.add("inputField", inputField);
+    }
+    if (outputField != null) {
+      args.add("outputField", outputField);
+    }
+    if (model != null) {
+      args.add("model", model);
+    }
+    return args;
+  }
+
+  /** Asserts that {@code init(args)} is rejected with exactly {@code expectedMessage}. */
+  private void assertInitFails(NamedList<String> args, String expectedMessage) {
+    TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();
+
+    SolrException e = assertThrows(SolrException.class, () -> factoryToTest.init(args));
+    assertEquals(expectedMessage, e.getMessage());
+  }
+
+  /**
+   * Asserts that {@code init(args)} succeeds but {@code getInstance} against the 'collection1'
+   * core is rejected with exactly {@code expectedMessage}.
+   */
+  private void assertGetInstanceFails(NamedList<String> args, String expectedMessage) {
+    TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();
+    factoryToTest.init(args);
+
+    // try-with-resources: the core obtained from the container must be closed even when one of
+    // the assertions below fails, otherwise the reference taken by getCore() is leaked
+    try (SolrCore collection1 = solrClientTestRule.getCoreContainer().getCore("collection1")) {
+      SolrQueryRequestBase req =
+          new SolrQueryRequestBase(collection1, new ModifiableSolrParams()) {};
+      SolrException e =
+          assertThrows(SolrException.class, () -> factoryToTest.getInstance(req, null, null));
+      assertEquals(expectedMessage, e.getMessage());
+    }
+  }
+}
diff --git a/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java new file mode 100644 index 00000000000..8614d637c5f --- /dev/null +++ b/solr/modules/llm/src/test/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.textvectorisation.update.processor;
+
+import java.io.IOException;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.llm.TestLlmBase;
+import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Indexes documents through the 'textToVector' and 'failingTextToVector' update chains and checks
+ * which documents end up with a value in the 'vector' field.
+ *
+ * <p>Each test that loads a model removes it again in a {@code finally} block, so a failing
+ * assertion cannot leave a model behind in the store for the following tests.
+ */
+public class TextToVectorUpdateProcessorTest extends TestLlmBase {
+
+  @BeforeClass
+  public static void init() throws Exception {
+    setupTest("solrconfig-llm.xml", "schema.xml", false, false);
+  }
+
+  @AfterClass
+  public static void cleanup() throws Exception {
+    afterTest();
+  }
+
+  @Test
+  public void processAdd_inputField_shouldVectoriseInputField() throws Exception {
+    loadModel("dummy-model.json"); // preparation
+    try {
+      addWithChain(sdoc("id", "99", "_text_", "Vegeta is the saiyan prince."), "textToVector");
+      addWithChain(
+          sdoc("id", "98", "_text_", "Kakaroth is a saiyan grown up on planet Earth."),
+          "textToVector");
+      assertU(commit());
+
+      assertJQ(
+          allDocsWithIdAndVector(),
+          "/response/numFound==2",
+          "/response/docs/[0]/id=='99'",
+          "/response/docs/[0]/vector==[1.0, 2.0, 3.0, 4.0]",
+          "/response/docs/[1]/id=='98'",
+          "/response/docs/[1]/vector==[1.0, 2.0, 3.0, 4.0]");
+    } finally {
+      deleteModel("dummy-1"); // clean up
+    }
+  }
+
+  /*
+  This test looks for the 'dummy-1' model, but such model is not loaded, the model store is empty, so the update fails
+  */
+  @Test
+  public void processAdd_modelNotFound_shouldThrowException() {
+    RuntimeException thrown =
+        assertThrows(
+            "model not found should throw an exception",
+            SolrClient.RemoteSolrException.class,
+            () -> {
+              addWithChain(
+                  sdoc("id", "99", "_text_", "Vegeta is the saiyan prince."), "textToVector");
+            });
+    assertTrue(
+        thrown
+            .getMessage()
+            .contains(
+                "The model configured in the Update Request Processor 'dummy-1' can't be found in the store: /schema/text-to-vector-model-store"));
+  }
+
+  @Test
+  public void processAdd_emptyInputField_shouldLogAndIndexWithNoVector() throws Exception {
+    loadModel("dummy-model.json"); // preparation
+    try {
+      addWithChain(sdoc("id", "99", "_text_", ""), "textToVector");
+      addWithChain(sdoc("id", "98", "_text_", "Vegeta is the saiyan prince."), "textToVector");
+      assertU(commit());
+
+      assertJQ(
+          allDocsWithIdAndVector(),
+          "/response/numFound==2",
+          "/response/docs/[0]/id=='99'",
+          "!/response/docs/[0]/vector==", // no vector field for the document 99
+          "/response/docs/[1]/id=='98'",
+          "/response/docs/[1]/vector==[1.0, 2.0, 3.0, 4.0]");
+    } finally {
+      deleteModel("dummy-1"); // clean up
+    }
+  }
+
+  @Test
+  public void processAdd_nullInputField_shouldLogAndIndexWithNoVector() throws Exception {
+    loadModel("dummy-model.json"); // preparation
+    try {
+      addWithChain(sdoc("id", "99", "_text_", "Vegeta is the saiyan prince."), "textToVector");
+      assertU(adoc("id", "98"));
+      assertU(commit());
+
+      assertJQ(
+          allDocsWithIdAndVector(),
+          "/response/numFound==2",
+          "/response/docs/[0]/id=='99'",
+          "/response/docs/[0]/vector==[1.0, 2.0, 3.0, 4.0]",
+          "/response/docs/[1]/id=='98'",
+          "!/response/docs/[1]/vector=="); // no vector field for the document 98
+    } finally {
+      deleteModel("dummy-1"); // clean up
+    }
+  }
+
+  @Test
+  public void processAdd_failingVectorisation_shouldLogAndIndexWithNoVector() throws Exception {
+    loadModel("exception-throwing-model.json"); // preparation
+    try {
+      addWithChain(
+          sdoc("id", "99", "_text_", "Vegeta is the saiyan prince."), "failingTextToVector");
+      addWithChain(
+          sdoc("id", "98", "_text_", "Kakaroth is a saiyan grown up on planet Earth."),
+          "failingTextToVector");
+      assertU(commit());
+
+      assertJQ(
+          allDocsWithIdAndVector(),
+          "/response/numFound==2",
+          "/response/docs/[0]/id=='99'",
+          "!/response/docs/[0]/vector==", // no vector field for the document 99
+          "/response/docs/[1]/id=='98'",
+          "!/response/docs/[1]/vector=="); // no vector field for the document 98
+    } finally {
+      deleteModel("exception-throwing-model"); // clean up
+    }
+  }
+
+  /** Request path of a match-all query that returns only the 'id' and 'vector' fields. */
+  private static String allDocsWithIdAndVector() {
+    final SolrQuery query = new SolrQuery();
+    query.setQuery("*:*");
+    query.add("fl", "id,vector");
+    return "/query" + query.toQueryString();
+  }
+
+  /** Removes the named model from the managed text-to-vector model store. */
+  private static void deleteModel(String modelName) throws Exception {
+    restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT + "/" + modelName);
+  }
+
+  /** Sends {@code document} to 'collection1' through the update chain named {@code updateChain}. */
+  void addWithChain(SolrInputDocument document, String updateChain)
+      throws SolrServerException, IOException {
+    UpdateRequest req = new UpdateRequest();
+    req.add(document);
+    req.setParam("update.chain", updateChain);
+    solrClientTestRule.getSolrClient("collection1").request(req);
+  }
+}
diff --git a/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc b/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc index fe2c22203e8..4a0b2a801f9 100644 --- a/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc +++ b/solr/solr-ref-guide/modules/configuration-guide/pages/update-request-processors.adoc @@ -415,6 +415,12 @@ When using any of these factories, please consult the {solr-javadocs}/core/org/a These processors are included in Solr releases as "module", and require additional jars loaded at runtime. See the README files associated with each module for details: +The {solr-javadocs}/modules/llm/index.html[`llm`] module provides:: + +{solr-javadocs}/modules/llm/org/apache/solr/llm/textvectorisation/update/processor/TextToVectorUpdateProcessorFactory.html[TextToVectorUpdateProcessorFactory]:: Update processor which vectorises a textual field in input and adds the resulting vector as the value of a new field. +It uses external text to vectors LLM to perform the vectorisation for each processed document. +For more information: xref:query-guide:text-to-vector.adoc[Update Request Processor] + The {solr-javadocs}/modules/langid/index.html[`langid`] module provides:: {solr-javadocs}/modules/langid/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.html[LangDetectLanguageIdentifierUpdateProcessorFactory]::: Identifies the language of a set of input fields using http://code.google.com/p/language-detection. 
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc b/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc index 664e3e71af8..cf0545ed030 100644 --- a/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc +++ b/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc @@ -16,7 +16,11 @@ // specific language governing permissions and limitations // under the License. -This module brings the power of *Large Language Models* (*LLM*s) to Solr. More specifically, it provides a text-to-vector capability, used on documents or queries, via integrating with popular external services that do this. The state-of-the-art of such services use an LLM, hence the name of this module. +This module brings the power of *Large Language Models* (*LLM*s) to Solr. + +More specifically, it provides a text-to-vector capability, used on documents or queries, via integrating with popular external services that do this. + +The state-of-the-art of such services use an LLM, hence the name of this module. _Without_ this module, vectors must be supplied _to_ Solr for indexing & searching, possibly coordinating with such services. @@ -25,6 +29,9 @@ _Without_ this module, vectors must be supplied _to_ Solr for indexing & searchi === From Text to Vector The aim of encoding text to numerical vectors is to represent text in a way that semantically similar sentences are encoded to vectors close in a vector space. + +Often this process is called 'text embedding' as it projects a piece of text into a high-dimensional latent vector space and embeds the text with such vector. + Vector distance metrics (algorithms) can then be used to compute a pairwise similarity, producing a score. @@ -32,6 +39,8 @@ Vector distance metrics (algorithms) can then be used to compute a pairwise simi Specific Large Language Models are able to encode text to a numerical vector. +These models are often called Embedding Models as they encode text to vector embeddings. 
+ For additional information you can refer to this https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post]. ==== Text to Vector Online Services @@ -69,9 +78,10 @@ You need to register / configure the plugins provided by the LLM module that you + [source,xml] ---- -<queryParser name="knn_text_to_vector" class="org.apache.solr.llm.texttovector.search.TextToVectorQParserPlugin"/> +<queryParser name="knn_text_to_vector" class="org.apache.solr.llm.textvectorisation.search.TextToVectorQParserPlugin"/> ---- + == Text to Vector Lifecycle @@ -259,8 +269,142 @@ http://localhost:8983/solr/techproducts/schema/text-to-vector-model-store ---- +Documentation Indexing time + + +=== Enriching documents with vectors at indexing time +To vectorise textual fields of your documents at indexing time you need to configure an {solr-javadocs}/core/org/apache/solr/update/processor/UpdateRequestProcessorChain.html[Update Request Processor Chain] that includes at least one `TextToVectorUpdateProcessor` update request processor (you can include more than one, if you want to vectorise multiple fields): + +[source,xml] +---- +<updateRequestProcessorChain name="textToVector"> + <processor class="solr.llm.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory"> + <str name="inputField">_text_</str> + <str name="outputField">vector</str> + <str name="model">dummy-1</str> + </processor> + <processor class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> +---- + +The `TextToVectorUpdateProcessor` update request processor vectorises the content of the 'inputField' for each document processed at indexing time. + +The resulting vector is added as a value for the 'outputField'. + +To perform the vectorisation it leverages a 'model' you have previously uploaded in the text-to-vector-model-store. + +[IMPORTANT] +==== +This update processor sends your document field content off to some hosted service on the internet. 
+There are serious performance implications that should be diligently examined before employing this component in production. +It will substantially slow down your indexing pipeline, so make sure to stress test your solution before going live. + +==== + +For more details on how to work with update request processors in Apache Solr, please refer to the dedicated page: xref:configuration-guide:update-request-processors.adoc[Update Request Processor] + +=== Index first and enrich your documents with vectors on a second pass +Vectorising text using a hosted service may be slow, so, depending on your use case, it could be a good idea to index your documents first and then add vectors iteratively. + +This can be done in Solr by defining two update request processor chains: one that includes all the processors you need, excluding the TextToVectorUpdateProcessor (let's call it 'no-vectorisation'), and one that includes the same processors plus the TextToVectorUpdateProcessor (let's call it 'vectorisation'). + +[source,xml] +---- +<updateRequestProcessorChain name="no-vectorisation"> +<processor class="solr.processor1"> + ... + </processor> +... +<processor class="solr.processorN"> + ... + </processor> + <processor class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> +---- + +[source,xml] +---- +<updateRequestProcessorChain name="vectorisation"> +<processor class="solr.processor1"> + ... + </processor> +... +<processor class="solr.processorN"> + ... + </processor> +<processor class="solr.llm.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory"> + <str name="inputField">_text_</str> + <str name="outputField">vector</str> + <str name="model">dummy-1</str> + </processor> + <processor class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> +---- + +You would index your documents first using the 'no-vectorisation' chain and, when finished, incrementally repeat the indexing targeting the 'vectorisation' chain. 
+ +[IMPORTANT] +==== +This implies you need to send the documents you want to index to Solr twice and re-run any other update request processors you need in the second chain. +This has data traffic implications (you transfer your documents over the network twice) and processing implications (if you have other update request processors in your chain, those must be repeated the second time as we are literally replacing the indexed documents one by one). +==== + +If your use case is compatible with xref:indexing-guide:partial-document-updates.adoc[Partial Updates], you can do better: + +You still define two chains, but this time the 'vectorisation' one only includes the 'TextToVectorUpdateProcessor' (and the xref:configuration-guide:update-request-processors.adoc[Mandatory Processors]): + +[source,xml] +---- +<updateRequestProcessorChain name="no-vectorisation"> +<processor class="solr.processor1"> + ... + </processor> +... +<processor class="solr.processorN"> + ... + </processor> + <processor class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> +---- + +[source,xml] +---- +<updateRequestProcessorChain name="vectorisation"> +<processor class="solr.llm.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory"> + <str name="inputField">_text_</str> + <str name="outputField">vector</str> + <str name="model">dummy-1</str> + </processor> + <processor class="solr.RunUpdateProcessorFactory"/> + </updateRequestProcessorChain> +---- + +Add to your schema a simple field that will be useful to track the vectorisation and use atomic updates: + +[source,xml] +---- +<field name="vectorised" type="boolean" indexed="true" stored="false" docValues="true" default="false"/> + +---- + +In the first pass just index your documents using your reliable and fast 'no-vectorisation' chain. 
+ +On the second pass, re-index all your documents using atomic updates and targeting the 'vectorisation' chain: + +[source,json] +---- +{"id":"mydoc", + "vectorised":{"set":true} +} +---- + +What will happen is that internally Solr fetches the stored content of the docs to update, all the existing fields are retrieved and a re-indexing happens, targeting the 'vectorisation' chain that will add the vector and set the boolean 'vectorised' field to 'true'. + +Faceting or querying on the boolean 'vectorised' field can also give you a quick idea on how many documents have been enriched with vectors. + + === Running a Text-to-Vector Query -To run a query that embeds your query text, using a model you previously uploaded is simple: +To run a query that vectorises your query text, using a model you previously uploaded is simple: [source,text] ?q={!knn_text_to_vector model=a-model f=vector topK=10}hello world query diff --git a/solr/test-framework/src/java/org/apache/solr/util/RestTestBase.java b/solr/test-framework/src/java/org/apache/solr/util/RestTestBase.java index ce066c4e1eb..4b0eb19eaad 100644 --- a/solr/test-framework/src/java/org/apache/solr/util/RestTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/util/RestTestBase.java @@ -88,13 +88,33 @@ public abstract class RestTestBase extends SolrJettyTestBase { if (response != null) fail(m + "update was not successful: " + response); } else { String response = restTestHarness.validateErrorUpdate(update); - if (response != null) fail(m + "update succeeded, but should have failed: " + response); + if (response == null) fail(m + "update succeeded, but should have failed: " + response); } } catch (SAXException e) { throw new RuntimeException("Invalid XML", e); } }
+  /**
+   * Sends {@code update} through the REST test harness and checks the returned response against
+   * the given XPath test expressions. The test fails when {@code TestHarness.validateXPath}
+   * reports a non-null result for them.
+   *
+   * @param update the update request to send
+   * @param tests XPath expressions the response is validated against
+   */
+  public static void checkUpdateU(String update, String... tests) {
+    try {
+      final String xmlResponse = restTestHarness.validateUpdate(update);
+      final String failedXPath = TestHarness.validateXPath(xmlResponse, tests);
+      if (failedXPath == null) {
+        return;
+      }
+      log.error(
+          "REQUEST FAILED: xpath={}\n\txml response was: {}\n\trequest was:{}",
+          failedXPath,
+          xmlResponse,
+          update);
+      fail(failedXPath);
+    } catch (XPathExpressionException invalidXPath) {
+      throw new RuntimeException("XPath is invalid", invalidXPath);
+    } catch (Exception requestFailure) {
+      log.error("REQUEST FAILED: {}", update, requestFailure);
+      throw new RuntimeException("Exception during query", requestFailure);
+    }
+  }
+
 /** * Validates a query matches some XPath test expressions *