This is an automated email from the ASF dual-hosted git repository.
abenedetti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new 5d554e18914 SOLR-17843: Fix TextToVectorUpdateProcessor for Partial
Updates (#3919)
5d554e18914 is described below
commit 5d554e18914529c10d510d107c63e88c2269f18d
Author: Ilaria Petreti <[email protected]>
AuthorDate: Wed Dec 17 10:53:57 2025 +0100
SOLR-17843: Fix TextToVectorUpdateProcessor for Partial Updates (#3919)
* Fixed the bug with TextToVectorUpdateProcessorTest
---
...ToVectorUpdateProcessor for partial updates.yml | 10 +++
.../processor/TextToVectorUpdateProcessor.java | 2 +-
.../{schema.xml => schema-language-models.xml} | 5 +-
.../conf/solrconfig-language-models.xml | 17 ++++++
.../search/TextToVectorQParserTest.java | 2 +-
.../store/rest/TestModelManager.java | 2 +-
.../store/rest/TestModelManagerPersistence.java | 2 +-
.../TextToVectorUpdateProcessorFactoryTest.java | 2 +-
.../processor/TextToVectorUpdateProcessorTest.java | 71 +++++++++++++++++++++-
.../modules/query-guide/pages/text-to-vector.adoc | 36 ++++++-----
10 files changed, 127 insertions(+), 22 deletions(-)
diff --git a/changelog/unreleased/SOLR-17843-fix textToVectorUpdateProcessor
for partial updates.yml b/changelog/unreleased/SOLR-17843-fix
textToVectorUpdateProcessor for partial updates.yml
new file mode 100644
index 00000000000..123800deeef
--- /dev/null
+++ b/changelog/unreleased/SOLR-17843-fix textToVectorUpdateProcessor for
partial updates.yml
@@ -0,0 +1,10 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: Fix TextToVectorUpdateProcessor for Partial Updates
+type: fixed
+authors:
+ - name: Ilaria Petreti
+ - name: Alessandro Benedetti
+ url: https://home.apache.org/phonebook.html?uid=abenedetti
+links:
+ - name: SOLR-17843
+ url: https://issues.apache.org/jira/browse/SOLR-17843
diff --git
a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
index f20d91e9685..116b4ba125b 100644
---
a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
+++
b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
@@ -69,7 +69,7 @@ class TextToVectorUpdateProcessor extends
UpdateRequestProcessor {
for (float f : vector) {
vectorAsList.add(f);
}
- doc.addField(outputField, vectorAsList);
+ doc.setField(outputField, vectorAsList);
} catch (RuntimeException vectorisationFailure) {
if (log.isErrorEnabled()) {
SchemaField uniqueKeyField = schema.getUniqueKeyField();
diff --git
a/solr/modules/language-models/src/test-files/solr/collection1/conf/schema.xml
b/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml
similarity index 92%
rename from
solr/modules/language-models/src/test-files/solr/collection1/conf/schema.xml
rename to
solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml
index a574b93294a..ef93fbc057d 100644
---
a/solr/modules/language-models/src/test-files/solr/collection1/conf/schema.xml
+++
b/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml
@@ -18,8 +18,9 @@
<!-- Test schema file for DenseVectorField -->
-<schema name="schema-densevector" version="1.7">
+<schema name="schema-language-models" version="1.7">
<fieldType name="string" class="solr.StrField" multiValued="true"/>
+ <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<fieldType name="knn_vector" class="solr.DenseVectorField"
vectorDimension="4" similarityFunction="cosine" />
<fieldType name="knn_vector_byte_encoding" class="solr.DenseVectorField"
vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE"/>
<fieldType name="high_dimensional_float_knn_vector"
class="solr.DenseVectorField" vectorDimension="2048"
similarityFunction="cosine" vectorEncoding="FLOAT32"/>
@@ -39,6 +40,8 @@
<field name="_version_" type="plong" indexed="true" stored="true"
multiValued="false" />
<field name="_text_" type="text_general" indexed="true" stored="false"
multiValued="true"/>
<copyField source="*" dest="_text_"/>
+ <field name="vectorised" type="boolean" indexed="true" stored="false"
docValues="true" default="false"/>
+
<fieldType name="text_general" class="solr.TextField"
positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
diff --git
a/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
index f9c58a3a849..e910d911826 100644
---
a/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
+++
b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
@@ -72,6 +72,23 @@
<processor class="solr.RunUpdateProcessorFactory"/>
</updateRequestProcessorChain>
+ <updateRequestProcessorChain name="textToVectorStoredInputField">
+ <processor
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
+ <str name="inputField">string_field</str>
+ <str name="outputField">vector</str>
+ <str name="model">dummy-1</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory"/>
+ </updateRequestProcessorChain>
+ <updateRequestProcessorChain name="textToVectorForPartialUpdates">
+ <processor class="solr.DistributedUpdateProcessorFactory"/>
+ <processor
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
+ <str name="inputField">string_field</str>
+ <str name="outputField">vector</str>
+ <str name="model">dummy-1</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory"/>
+ </updateRequestProcessorChain>
</config>
diff --git
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
index be47a160f64..95395c18565 100644
---
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
+++
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
@@ -27,7 +27,7 @@ import org.junit.Test;
public class TextToVectorQParserTest extends TestLanguageModelBase {
@BeforeClass
public static void init() throws Exception {
- setupTest("solrconfig-language-models.xml", "schema.xml", true, false);
+ setupTest("solrconfig-language-models.xml", "schema-language-models.xml",
true, false);
loadModel("dummy-model.json");
}
diff --git
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
index da70dbb696f..66b48884854 100644
---
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
+++
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
@@ -31,7 +31,7 @@ public class TestModelManager extends TestLanguageModelBase {
@BeforeClass
public static void init() throws Exception {
- setupTest("solrconfig-language-models.xml", "schema.xml", false, false);
+ setupTest("solrconfig-language-models.xml", "schema-language-models.xml",
false, false);
}
@AfterClass
diff --git
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
index efa90848a9c..ece13f4e1b7 100644
---
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
+++
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
@@ -30,7 +30,7 @@ public class TestModelManagerPersistence extends
TestLanguageModelBase {
@Before
public void init() throws Exception {
- setupTest("solrconfig-language-models.xml", "schema.xml", false, true);
+ setupTest("solrconfig-language-models.xml", "schema-language-models.xml",
false, true);
}
@After
diff --git
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
index 4185d9913b2..c8b063bd129 100644
---
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
+++
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
@@ -35,7 +35,7 @@ public class TextToVectorUpdateProcessorFactoryTest extends
TestLanguageModelBas
@BeforeClass
public static void init() throws Exception {
- setupTest("solrconfig-language-models.xml", "schema.xml", false, false);
+ setupTest("solrconfig-language-models.xml", "schema-language-models.xml",
false, false);
}
@AfterClass
diff --git
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
index bc7de75748b..4354c1a69e3 100644
---
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
+++
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
@@ -17,6 +17,7 @@
package org.apache.solr.languagemodels.textvectorisation.update.processor;
import java.io.IOException;
+import java.util.Map;
import org.apache.solr.client.solrj.RemoteSolrException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.SolrQuery;
@@ -33,7 +34,7 @@ public class TextToVectorUpdateProcessorTest extends
TestLanguageModelBase {
@BeforeClass
public static void init() throws Exception {
- setupTest("solrconfig-language-models.xml", "schema.xml", false, false);
+ setupTest("solrconfig-language-models.xml", "schema-language-models.xml",
false, false);
}
@AfterClass
@@ -162,6 +163,74 @@ public class TextToVectorUpdateProcessorTest extends
TestLanguageModelBase {
ManagedTextToVectorModelStore.REST_END_POINT +
"/exception-throwing-model"); // clean up
}
+ @Test
+ public void
processAtomicUpdate_shouldTriggerTheVectorizationAndFetchTheStoredContent()
+ throws Exception {
+ // Verifies that when using a processor chain configured for partial
updates
+ // (i.e., the DistributedUpdateProcessor is placed before the TextToVector
processor),
+ // the system correctly retrieves the stored value of the input field
(string_field)
+ // and generates the vector for the document.
+ loadModel("dummy-model.json");
+ assertU(adoc("id", "99", "string_field", "Vegeta is the saiyan prince."));
+ assertU(adoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on
planet Earth."));
+ assertU(commit());
+
+ SolrInputDocument atomic_doc = new SolrInputDocument();
+ atomic_doc.setField("id", "99");
+ atomic_doc.setField("vectorised", Map.of("set", "true"));
+ addWithChain(
+ atomic_doc, "textToVectorForPartialUpdates"); // use the chain that
supports partial updates
+ assertU(commit());
+
+ final SolrQuery query = getSolrQuery();
+
+ assertJQ(
+ "/query" + query.toQueryString(),
+ "/response/numFound==2]",
+ "/response/docs/[0]/id=='98'",
+ "!/response/docs/[0]/vector==", // no vector field for document 98
+ "/response/docs/[1]/id=='99'",
+ "/response/docs/[1]/vector==[1.0, 2.0, 3.0, 4.0]");
+
+ restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT +
"/dummy-1");
+ }
+
+ @Test
+ public void processAtomicUpdate_shouldReplaceExistingVectorNotAppend()
throws Exception {
+ // This test verifies that when a document already contains a vector and the
+ // string_field field is modified using an atomic update, the vector is
+ // recomputed and replaces the previous one. It ensures that the system does
+ // not append or merge vector values.
+ loadModel("dummy-model.json");
+ addWithChain(
+ sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."),
+ "textToVectorStoredInputField");
+ addWithChain(
+ sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on
planet Earth."),
+ "textToVectorStoredInputField");
+ assertU(commit());
+
+ SolrInputDocument atomic_doc = new SolrInputDocument();
+ atomic_doc.setField("id", "99");
+ atomic_doc.setField(
+ "string_field", Map.of("set", "Vegeta is the saiyan prince from the
Dragon Ball series."));
+ addWithChain(
+ atomic_doc, "textToVectorForPartialUpdates"); // use the chain that
supports partial updates
+ assertU(commit());
+
+ final SolrQuery query = getSolrQuery();
+
+ assertJQ(
+ "/query" + query.toQueryString(),
+ "/response/numFound==2]",
+ "/response/docs/[0]/id=='99'",
+ "/response/docs/[0]/vector==[1.0, 2.0, 3.0, 4.0]",
+ "/response/docs/[1]/id=='98'",
+ "/response/docs/[1]/vector==[1.0, 2.0, 3.0, 4.0]");
+
+ restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT +
"/dummy-1");
+ }
+
void addWithChain(SolrInputDocument document, String updateChain)
throws SolrServerException, IOException {
UpdateRequest req = new UpdateRequest();
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
b/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
index 13d845459f7..ebcaf8c7189 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
@@ -284,7 +284,7 @@ To vectorise textual fields of your documents at indexing
time you need to confi
<str name="model">dummy-1</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
----
The `TextToVectorUpdateProcessor` update request processor vectorises the
content of the 'inputField' for each document processed at indexing time.
@@ -311,25 +311,25 @@ This can be done in Solr defining two update request
processors chains: one that
[source,xml]
----
<updateRequestProcessorChain name="no-vectorisation">
-<processor class="solr.processor1">
+ <processor class="solr.processor1">
...
</processor>
-...
-<processor class="solr.processorN">
+ ...
+ <processor class="solr.processorN">
...
</processor>
<processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
----
[source,xml]
----
<updateRequestProcessorChain name="vectorisation">
-<processor class="solr.processor1">
+ <processor class="solr.processor1">
...
</processor>
-...
-<processor class="solr.processorN">
+ ...
+ <processor class="solr.processorN">
...
</processor>
<processor
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
@@ -338,7 +338,7 @@ This can be done in Solr defining two update request
processors chains: one that
<str name="model">dummy-1</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
----
You would index your documents first using the 'no-vectorisation' and when
finished, incrementally repeat the indexing targeting the 'vectorisation' chain.
@@ -356,29 +356,35 @@ You still define two chains, but this time the
'vectorisation' one only includes
[source,xml]
----
<updateRequestProcessorChain name="no-vectorisation">
-<processor class="solr.processor1">
+ <processor class="solr.processor1">
...
</processor>
-...
-<processor class="solr.processorN">
+ ...
+ <processor class="solr.processorN">
...
</processor>
<processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
----
[source,xml]
----
<updateRequestProcessorChain name="vectorisation">
-<processor
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
+ <processor class="solr.DistributedUpdateProcessorFactory"/>
+ <processor
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
<str name="inputField">_text_</str>
<str name="outputField">vector</str>
<str name="model">dummy-1</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
----
+[NOTE]
+====
+Since partial updates are resolved by `DistributedUpdateProcessorFactory`, be
sure to place `TextToVectorUpdateProcessorFactory` afterwards so that it sees
normal/complete documents.
+====
+
Add to your schema a simple field that will be useful to track the
vectorisation and use atomic updates:
[source,xml]