(solr) branch main updated: SOLR-17843: Fix TextToVectorUpdateProcessor for Partial Updates (#3919)

abenedetti Wed, 17 Dec 2025 01:54:09 -0800

This is an automated email from the ASF dual-hosted git repository.

abenedetti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git



The following commit(s) were added to refs/heads/main by this push:
     new 5d554e18914 SOLR-17843: Fix TextToVectorUpdateProcessor for Partial 
Updates (#3919)
5d554e18914 is described below

commit 5d554e18914529c10d510d107c63e88c2269f18d
Author: Ilaria Petreti <[email protected]>
AuthorDate: Wed Dec 17 10:53:57 2025 +0100

    SOLR-17843: Fix TextToVectorUpdateProcessor for Partial Updates (#3919)
    
    * Fixed the bug with TextToVectorUpdateProcessorTest
---
 ...ToVectorUpdateProcessor for partial updates.yml | 10 +++
 .../processor/TextToVectorUpdateProcessor.java     |  2 +-
 .../{schema.xml => schema-language-models.xml}     |  5 +-
 .../conf/solrconfig-language-models.xml            | 17 ++++++
 .../search/TextToVectorQParserTest.java            |  2 +-
 .../store/rest/TestModelManager.java               |  2 +-
 .../store/rest/TestModelManagerPersistence.java    |  2 +-
 .../TextToVectorUpdateProcessorFactoryTest.java    |  2 +-
 .../processor/TextToVectorUpdateProcessorTest.java | 71 +++++++++++++++++++++-
 .../modules/query-guide/pages/text-to-vector.adoc  | 36 ++++++-----
 10 files changed, 127 insertions(+), 22 deletions(-)

diff --git a/changelog/unreleased/SOLR-17843-fix textToVectorUpdateProcessor 
for partial updates.yml b/changelog/unreleased/SOLR-17843-fix 
textToVectorUpdateProcessor for partial updates.yml
new file mode 100644
index 00000000000..123800deeef
--- /dev/null
+++ b/changelog/unreleased/SOLR-17843-fix textToVectorUpdateProcessor for 
partial updates.yml   
@@ -0,0 +1,10 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: Fix TextToVectorUpdateProcessor for Partial Updates
+type: fixed
+authors:
+  - name: Ilaria Petreti
+  - name: Alessandro Benedetti
+    url: https://home.apache.org/phonebook.html?uid=abenedetti
+links:
+  - name: SOLR-17843
+    url: https://issues.apache.org/jira/browse/SOLR-17843
diff --git 
a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
 
b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
index f20d91e9685..116b4ba125b 100644
--- 
a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
+++ 
b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessor.java
@@ -69,7 +69,7 @@ class TextToVectorUpdateProcessor extends 
UpdateRequestProcessor {
         for (float f : vector) {
           vectorAsList.add(f);
         }
-        doc.addField(outputField, vectorAsList);
+        doc.setField(outputField, vectorAsList);
       } catch (RuntimeException vectorisationFailure) {
         if (log.isErrorEnabled()) {
           SchemaField uniqueKeyField = schema.getUniqueKeyField();
diff --git 
a/solr/modules/language-models/src/test-files/solr/collection1/conf/schema.xml 
b/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml
similarity index 92%
rename from 
solr/modules/language-models/src/test-files/solr/collection1/conf/schema.xml
rename to 
solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml
index a574b93294a..ef93fbc057d 100644
--- 
a/solr/modules/language-models/src/test-files/solr/collection1/conf/schema.xml
+++ 
b/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml
@@ -18,8 +18,9 @@
 
 <!-- Test schema file for DenseVectorField -->
 
-<schema name="schema-densevector" version="1.7">
+<schema name="schema-language-models" version="1.7">
   <fieldType name="string" class="solr.StrField" multiValued="true"/>
+  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
   <fieldType name="knn_vector" class="solr.DenseVectorField" 
vectorDimension="4" similarityFunction="cosine" />
   <fieldType name="knn_vector_byte_encoding" class="solr.DenseVectorField" 
vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE"/>
   <fieldType name="high_dimensional_float_knn_vector" 
class="solr.DenseVectorField" vectorDimension="2048" 
similarityFunction="cosine" vectorEncoding="FLOAT32"/>
@@ -39,6 +40,8 @@
   <field name="_version_" type="plong" indexed="true" stored="true" 
multiValued="false" />
   <field name="_text_" type="text_general" indexed="true" stored="false" 
multiValued="true"/>
   <copyField source="*" dest="_text_"/>
+  <field name="vectorised" type="boolean" indexed="true" stored="false" 
docValues="true" default="false"/>
+
   <fieldType name="text_general" class="solr.TextField" 
positionIncrementGap="100" multiValued="true">
     <analyzer type="index">
       <tokenizer class="solr.StandardTokenizerFactory"/>
diff --git 
a/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
 
b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
index f9c58a3a849..e910d911826 100644
--- 
a/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
+++ 
b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-language-models.xml
@@ -72,6 +72,23 @@
   <processor class="solr.RunUpdateProcessorFactory"/>
  </updateRequestProcessorChain>
 
+  <updateRequestProcessorChain name="textToVectorStoredInputField">
+    <processor 
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
+      <str name="inputField">string_field</str>
+      <str name="outputField">vector</str>
+      <str name="model">dummy-1</str>
+    </processor>
+    <processor class="solr.RunUpdateProcessorFactory"/>
+  </updateRequestProcessorChain>
 
+  <updateRequestProcessorChain name="textToVectorForPartialUpdates">
+    <processor class="solr.DistributedUpdateProcessorFactory"/>
+    <processor 
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
+      <str name="inputField">string_field</str>
+      <str name="outputField">vector</str>
+      <str name="model">dummy-1</str>
+    </processor>
+    <processor class="solr.RunUpdateProcessorFactory"/>
+  </updateRequestProcessorChain>
 
 </config>
diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
index be47a160f64..95395c18565 100644
--- 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
+++ 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/search/TextToVectorQParserTest.java
@@ -27,7 +27,7 @@ import org.junit.Test;
 public class TextToVectorQParserTest extends TestLanguageModelBase {
   @BeforeClass
   public static void init() throws Exception {
-    setupTest("solrconfig-language-models.xml", "schema.xml", true, false);
+    setupTest("solrconfig-language-models.xml", "schema-language-models.xml", 
true, false);
     loadModel("dummy-model.json");
   }
 
diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
index da70dbb696f..66b48884854 100644
--- 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
+++ 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManager.java
@@ -31,7 +31,7 @@ public class TestModelManager extends TestLanguageModelBase {
 
   @BeforeClass
   public static void init() throws Exception {
-    setupTest("solrconfig-language-models.xml", "schema.xml", false, false);
+    setupTest("solrconfig-language-models.xml", "schema-language-models.xml", 
false, false);
   }
 
   @AfterClass
diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
index efa90848a9c..ece13f4e1b7 100644
--- 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
+++ 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/store/rest/TestModelManagerPersistence.java
@@ -30,7 +30,7 @@ public class TestModelManagerPersistence extends 
TestLanguageModelBase {
 
   @Before
   public void init() throws Exception {
-    setupTest("solrconfig-language-models.xml", "schema.xml", false, true);
+    setupTest("solrconfig-language-models.xml", "schema-language-models.xml", 
false, true);
   }
 
   @After
diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
index 4185d9913b2..c8b063bd129 100644
--- 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
+++ 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorFactoryTest.java
@@ -35,7 +35,7 @@ public class TextToVectorUpdateProcessorFactoryTest extends 
TestLanguageModelBas
 
   @BeforeClass
   public static void init() throws Exception {
-    setupTest("solrconfig-language-models.xml", "schema.xml", false, false);
+    setupTest("solrconfig-language-models.xml", "schema-language-models.xml", 
false, false);
   }
 
   @AfterClass
diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
index bc7de75748b..4354c1a69e3 100644
--- 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
+++ 
b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/textvectorisation/update/processor/TextToVectorUpdateProcessorTest.java
@@ -17,6 +17,7 @@
 package org.apache.solr.languagemodels.textvectorisation.update.processor;
 
 import java.io.IOException;
+import java.util.Map;
 import org.apache.solr.client.solrj.RemoteSolrException;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.request.SolrQuery;
@@ -33,7 +34,7 @@ public class TextToVectorUpdateProcessorTest extends 
TestLanguageModelBase {
 
   @BeforeClass
   public static void init() throws Exception {
-    setupTest("solrconfig-language-models.xml", "schema.xml", false, false);
+    setupTest("solrconfig-language-models.xml", "schema-language-models.xml", 
false, false);
   }
 
   @AfterClass
@@ -162,6 +163,74 @@ public class TextToVectorUpdateProcessorTest extends 
TestLanguageModelBase {
         ManagedTextToVectorModelStore.REST_END_POINT + 
"/exception-throwing-model"); // clean up
   }
 
+  @Test
+  public void 
processAtomicUpdate_shouldTriggerTheVectorizationAndFetchTheStoredContent()
+      throws Exception {
+    // Verifies that when using a processor chain configured for partial 
updates
+    // (i.e., the DistributedUpdateProcessor is placed before the TextToVector 
processor),
+    // the system correctly retrieves the stored value of the input field 
(string_field)
+    // and generates the vector for the document.
+    loadModel("dummy-model.json");
+    assertU(adoc("id", "99", "string_field", "Vegeta is the saiyan prince."));
+    assertU(adoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on 
planet Earth."));
+    assertU(commit());
+
+    SolrInputDocument atomic_doc = new SolrInputDocument();
+    atomic_doc.setField("id", "99");
+    atomic_doc.setField("vectorised", Map.of("set", "true"));
+    addWithChain(
+        atomic_doc, "textToVectorForPartialUpdates"); // use the chain that 
supports partial updates
+    assertU(commit());
+
+    final SolrQuery query = getSolrQuery();
+
+    assertJQ(
+        "/query" + query.toQueryString(),
+        "/response/numFound==2]",
+        "/response/docs/[0]/id=='98'",
+        "!/response/docs/[0]/vector==", // no vector field for document 98
+        "/response/docs/[1]/id=='99'",
+        "/response/docs/[1]/vector==[1.0, 2.0, 3.0, 4.0]");
+
+    restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT + 
"/dummy-1");
+  }
+
+  @Test
+  public void processAtomicUpdate_shouldReplaceExistingVectorNotAppend() 
throws Exception {
+    // This test verifies that when a document already contains a vector and the
+    // string_field field is modified using an atomic update, the vector is
+    // recomputed and replaces the previous one. It ensures that the system does
+    // not append or merge vector values.
+    loadModel("dummy-model.json");
+    addWithChain(
+        sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."),
+        "textToVectorStoredInputField");
+    addWithChain(
+        sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on 
planet Earth."),
+        "textToVectorStoredInputField");
+    assertU(commit());
+
+    SolrInputDocument atomic_doc = new SolrInputDocument();
+    atomic_doc.setField("id", "99");
+    atomic_doc.setField(
+        "string_field", Map.of("set", "Vegeta is the saiyan prince from the 
Dragon Ball series."));
+    addWithChain(
+        atomic_doc, "textToVectorForPartialUpdates"); // use the chain that 
supports partial updates
+    assertU(commit());
+
+    final SolrQuery query = getSolrQuery();
+
+    assertJQ(
+        "/query" + query.toQueryString(),
+        "/response/numFound==2]",
+        "/response/docs/[0]/id=='99'",
+        "/response/docs/[0]/vector==[1.0, 2.0, 3.0, 4.0]",
+        "/response/docs/[1]/id=='98'",
+        "/response/docs/[1]/vector==[1.0, 2.0, 3.0, 4.0]");
+
+    restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT + 
"/dummy-1");
+  }
+
   void addWithChain(SolrInputDocument document, String updateChain)
       throws SolrServerException, IOException {
     UpdateRequest req = new UpdateRequest();
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc 
b/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
index 13d845459f7..ebcaf8c7189 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/text-to-vector.adoc
@@ -284,7 +284,7 @@ To vectorise textual fields of your documents at indexing 
time you need to confi
    <str name="model">dummy-1</str>
   </processor>
   <processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
 ----
 
 The `TextToVectorUpdateProcessor` update request processor vectorises the 
content of the 'inputField' for each document processed at indexing time.
@@ -311,25 +311,25 @@ This can be done in Solr defining two update request 
processors chains: one that
 [source,xml]
 ----
 <updateRequestProcessorChain name="no-vectorisation">
-<processor class="solr.processor1">
+  <processor class="solr.processor1">
    ...
   </processor>
-...
-<processor class="solr.processorN">
+   ...
+  <processor class="solr.processorN">
    ...
   </processor>
   <processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
 ----
 
 [source,xml]
 ----
 <updateRequestProcessorChain name="vectorisation">
-<processor class="solr.processor1">
+  <processor class="solr.processor1">
    ...
   </processor>
-...
-<processor class="solr.processorN">
+   ...
+  <processor class="solr.processorN">
    ...
   </processor>
 <processor 
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
@@ -338,7 +338,7 @@ This can be done in Solr defining two update request 
processors chains: one that
    <str name="model">dummy-1</str>
   </processor>
   <processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
 ----
 
 You would index your documents first using the 'no-vectorisation' and when 
finished, incrementally repeat the indexing targeting the 'vectorisation' chain.
@@ -356,29 +356,35 @@ You still define two chains, but this time the 
'vectorisation' one only includes
 [source,xml]
 ----
 <updateRequestProcessorChain name="no-vectorisation">
-<processor class="solr.processor1">
+  <processor class="solr.processor1">
    ...
   </processor>
-...
-<processor class="solr.processorN">
+   ...
+  <processor class="solr.processorN">
    ...
   </processor>
   <processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
 ----
 
 [source,xml]
 ----
 <updateRequestProcessorChain name="vectorisation">
-<processor 
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
+  <processor class="solr.DistributedUpdateProcessorFactory"/>
+  <processor 
class="solr.languagemodels.textvectorisation.update.processor.TextToVectorUpdateProcessorFactory">
    <str name="inputField">_text_</str>
    <str name="outputField">vector</str>
    <str name="model">dummy-1</str>
   </processor>
   <processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
+</updateRequestProcessorChain>
 ----
 
+[NOTE]
+====
+Since partial (atomic) updates are resolved into complete documents by 
`DistributedUpdateProcessorFactory`, be sure to place 
`TextToVectorUpdateProcessorFactory` after it in the chain, so that it sees 
complete documents rather than partial ones.
+====
+
 Add to your schema a simple field that will be useful to track the 
vectorisation and use atomic updates:
 
 [source,xml]

(solr) branch main updated: SOLR-17843: Fix TextToVectorUpdateProcessor for Partial Updates (#3919)

Reply via email to