This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6af551894e improve jina-integration (#2813)
6af551894e is described below

commit 6af551894ede0656b6131ec038bc5fd12a6627e8
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 14 12:36:35 2026 -0400

    improve jina-integration (#2813)
---
 .../tika/inference/AbstractEmbeddingFilter.java    |   2 +-
 .../org/apache/tika/inference/ChunkSerializer.java |  11 +-
 .../apache/tika/inference/JinaEmbeddingFilter.java |  91 ++++++++++++
 .../tika/inference/OpenAIImageEmbeddingParser.java |  10 +-
 .../tika/inference/JinaEmbeddingFilterTest.java    | 159 +++++++++++++++++++++
 5 files changed, 269 insertions(+), 4 deletions(-)

diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java
index 5d5cf78282..a89573c29b 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java
@@ -143,7 +143,7 @@ public abstract class AbstractEmbeddingFilter extends MetadataFilter {
                         i, Math.min(i + batchSize, chunks.size()));
                 embed(batch, defaultConfig);
             }
-            ChunkSerializer.mergeInto(metadata, chunks);
+            ChunkSerializer.mergeInto(metadata, chunks, defaultConfig.getOutputField());
         } catch (IOException e) {
             throw new TikaException(
                     "Embedding inference failed: " + e.getMessage(), e);
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java
index 95e3234884..84ecafb806 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java
@@ -76,15 +76,22 @@ public final class ChunkSerializer {
     public static void mergeInto(
             org.apache.tika.metadata.Metadata metadata,
             List<Chunk> newChunks) throws IOException {
+        mergeInto(metadata, newChunks, TikaCoreProperties.TIKA_CHUNKS);
+    }
+
+    public static void mergeInto(
+            org.apache.tika.metadata.Metadata metadata,
+            List<Chunk> newChunks,
+            String fieldName) throws IOException {
         List<Chunk> existing;
-        String current = metadata.get(TikaCoreProperties.TIKA_CHUNKS);
+        String current = metadata.get(fieldName);
         if (current != null && !current.isEmpty()) {
             existing = fromJson(current);
         } else {
             existing = new ArrayList<>();
         }
         existing.addAll(newChunks);
-        metadata.set(TikaCoreProperties.TIKA_CHUNKS, toJson(existing));
+        metadata.set(fieldName, toJson(existing));
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java
new file mode 100644
index 0000000000..aacd32447b
--- /dev/null
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.inference;
+
+import java.util.List;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Extends {@link OpenAIEmbeddingFilter} for
+ * <a href="https://jina.ai/embeddings/">Jina AI v5 text embeddings</a>.
+ * <p>
+ * The only difference from the standard OpenAI format is an optional
+ * {@code "task"} field in the request body that instructs the Jina model
+ * how to optimise the embedding.  Supported values include
+ * {@code retrieval.passage} (default, for indexing documents),
+ * {@code retrieval.query} (for query-time embeddings),
+ * {@code text-matching}, {@code classification}, and {@code separation}.
+ * <p>
+ * Configuration key: {@code "jina-embedding-filter"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "jina-embedding-filter", spi = false)
+public class JinaEmbeddingFilter extends OpenAIEmbeddingFilter {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /**
+     * Jina task type.  Default: {@code retrieval.passage} (for indexing).
+     * Set to {@code retrieval.query} when embedding search queries.
+     */
+    private String task = "retrieval.passage";
+
+    public JinaEmbeddingFilter() {
+        super();
+    }
+
+    public JinaEmbeddingFilter(InferenceConfig config) {
+        super(config);
+    }
+
+    @Override
+    String buildRequest(List<Chunk> chunks, InferenceConfig config) {
+        ObjectNode root = MAPPER.createObjectNode();
+        if (!StringUtils.isBlank(config.getModel())) {
+            root.put("model", config.getModel());
+        }
+        if (!StringUtils.isBlank(task)) {
+            root.put("task", task);
+        }
+        ArrayNode input = root.putArray("input");
+        for (Chunk chunk : chunks) {
+            input.add(chunk.getText());
+        }
+        return root.toString();
+    }
+
+    public String getTask() {
+        return task;
+    }
+
+    /**
+     * Set the Jina task type.  Default is {@code retrieval.passage}.
+     * Use {@code retrieval.query} when embedding search queries.
+     */
+    public void setTask(String task) {
+        this.task = task;
+    }
+}
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
index ea771df89e..b186d5465e 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
@@ -177,7 +177,7 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable, Closea
         Chunk chunk = new Chunk(null, locators);
         chunk.setVector(vector);
 
-        ChunkSerializer.mergeInto(metadata, List.of(chunk));
+        ChunkSerializer.mergeInto(metadata, List.of(chunk), config.getOutputField());
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(
                 handler, metadata, parseContext);
@@ -371,6 +371,14 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable, Closea
         defaultConfig.setMaxFileSizeToEmbed(maxFileSizeToEmbed);
     }
 
+    public String getOutputField() {
+        return defaultConfig.getOutputField();
+    }
+
+    public void setOutputField(String outputField) {
+        defaultConfig.setOutputField(outputField);
+    }
+
     // ---- Azure / endpoint config getters/setters ----------------------------
 
     public String getEmbeddingsPath() {
diff --git 
a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java
 
b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java
new file mode 100644
index 0000000000..803eb48523
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.inference;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.http.TikaTestHttpServer;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class JinaEmbeddingFilterTest {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private TikaTestHttpServer server;
+    private JinaEmbeddingFilter filter;
+    private InferenceConfig config;
+
+    @BeforeEach
+    void setUp() throws Exception {
+        server = new TikaTestHttpServer();
+
+        config = new InferenceConfig();
+        config.setBaseUrl(server.url());
+        config.setModel("jina-embeddings-v3");
+        config.setMaxChunkChars(500);
+        config.setOverlapChars(0);
+        config.setTimeoutSeconds(10);
+
+        filter = new JinaEmbeddingFilter(config);
+    }
+
+    @AfterEach
+    void tearDown() {
+        server.shutdown();
+    }
+
+    @Test
+    void testDefaultTaskInRequest() throws Exception {
+        server.enqueue(new TikaTestHttpServer.MockResponse(200,
+                buildEmbeddingResponse(1, 3)));
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), "Some document text.");
+        List<Metadata> list = new ArrayList<>();
+        list.add(metadata);
+        filter.filter(list);
+
+        TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+        JsonNode body = MAPPER.readTree(request.body());
+        assertEquals("retrieval.passage", body.get("task").asText(),
+                "Default task should be retrieval.passage");
+        assertEquals("jina-embeddings-v3", body.get("model").asText());
+    }
+
+    @Test
+    void testCustomTask() throws Exception {
+        filter.setTask("retrieval.query");
+        server.enqueue(new TikaTestHttpServer.MockResponse(200,
+                buildEmbeddingResponse(1, 3)));
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), "What is Tika?");
+        List<Metadata> list = new ArrayList<>();
+        list.add(metadata);
+        filter.filter(list);
+
+        TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+        JsonNode body = MAPPER.readTree(request.body());
+        assertEquals("retrieval.query", body.get("task").asText());
+    }
+
+    @Test
+    void testBuildRequestJsonShape() {
+        List<Chunk> chunks = List.of(
+                new Chunk("Hello", 0, 5),
+                new Chunk("World", 6, 11));
+        String json = filter.buildRequest(chunks, config);
+
+        assertTrue(json.contains("\"task\":\"retrieval.passage\""),
+                "Should include task field: " + json);
+        assertTrue(json.contains("\"model\":\"jina-embeddings-v3\""),
+                "Should include model field: " + json);
+        assertTrue(json.contains("\"Hello\""), "Should include first chunk");
+        assertTrue(json.contains("\"World\""), "Should include second chunk");
+    }
+
+    @Test
+    void testEndToEnd() throws Exception {
+        server.enqueue(new TikaTestHttpServer.MockResponse(200,
+                buildEmbeddingResponse(2, 4)));
+
+        String content = "# Introduction\n\nFirst section text.\n\n"
+                + "# Background\n\nSecond section text.";
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), content);
+        List<Metadata> list = new ArrayList<>();
+        list.add(metadata);
+        filter.filter(list);
+
+        String chunksJson = metadata.get("tika:chunks");
+        assertNotNull(chunksJson, "Should have tika:chunks");
+
+        List<Chunk> chunks = ChunkSerializer.fromJson(chunksJson);
+        assertEquals(2, chunks.size());
+        assertNotNull(chunks.get(0).getVector());
+        assertNotNull(chunks.get(1).getVector());
+        assertEquals(4, chunks.get(0).getVector().length);
+    }
+
+    private String buildEmbeddingResponse(int numVectors, int dims) {
+        StringBuilder sb = new StringBuilder();
+        sb.append("{\"object\":\"list\",\"data\":[");
+        for (int i = 0; i < numVectors; i++) {
+            if (i > 0) {
+                sb.append(",");
+            }
+            sb.append("{\"object\":\"embedding\",\"index\":").append(i);
+            sb.append(",\"embedding\":[");
+            for (int d = 0; d < dims; d++) {
+                if (d > 0) {
+                    sb.append(",");
+                }
+                sb.append(String.format(java.util.Locale.ROOT,
+                        "%.6f", (i + 1) * 0.1 + d * 0.01));
+            }
+            sb.append("]}");
+        }
+        sb.append("],\"model\":\"jina-embeddings-v3\",");
+        sb.append("\"usage\":{\"prompt_tokens\":10,\"total_tokens\":10}}");
+        return sb.toString();
+    }
+}

Reply via email to