This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6af551894e improve jina-integration (#2813)
6af551894e is described below
commit 6af551894ede0656b6131ec038bc5fd12a6627e8
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 14 12:36:35 2026 -0400
improve jina-integration (#2813)
---
.../tika/inference/AbstractEmbeddingFilter.java | 2 +-
.../org/apache/tika/inference/ChunkSerializer.java | 11 +-
.../apache/tika/inference/JinaEmbeddingFilter.java | 91 ++++++++++++
.../tika/inference/OpenAIImageEmbeddingParser.java | 10 +-
.../tika/inference/JinaEmbeddingFilterTest.java | 159 +++++++++++++++++++++
5 files changed, 269 insertions(+), 4 deletions(-)
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java
index 5d5cf78282..a89573c29b 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/AbstractEmbeddingFilter.java
@@ -143,7 +143,7 @@ public abstract class AbstractEmbeddingFilter extends MetadataFilter {
i, Math.min(i + batchSize, chunks.size()));
embed(batch, defaultConfig);
}
- ChunkSerializer.mergeInto(metadata, chunks);
+ ChunkSerializer.mergeInto(metadata, chunks, defaultConfig.getOutputField());
} catch (IOException e) {
throw new TikaException(
"Embedding inference failed: " + e.getMessage(), e);
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java
index 95e3234884..84ecafb806 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/ChunkSerializer.java
@@ -76,15 +76,22 @@ public final class ChunkSerializer {
public static void mergeInto(
org.apache.tika.metadata.Metadata metadata,
List<Chunk> newChunks) throws IOException {
+ mergeInto(metadata, newChunks, TikaCoreProperties.TIKA_CHUNKS);
+ }
+
+ public static void mergeInto(
+ org.apache.tika.metadata.Metadata metadata,
+ List<Chunk> newChunks,
+ String fieldName) throws IOException {
List<Chunk> existing;
- String current = metadata.get(TikaCoreProperties.TIKA_CHUNKS);
+ String current = metadata.get(fieldName);
if (current != null && !current.isEmpty()) {
existing = fromJson(current);
} else {
existing = new ArrayList<>();
}
existing.addAll(newChunks);
- metadata.set(TikaCoreProperties.TIKA_CHUNKS, toJson(existing));
+ metadata.set(fieldName, toJson(existing));
}
/**
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java
new file mode 100644
index 0000000000..aacd32447b
--- /dev/null
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/JinaEmbeddingFilter.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.inference;
+
+import java.util.List;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Extends {@link OpenAIEmbeddingFilter} for
+ * <a href="https://jina.ai/embeddings/">Jina AI v5 text embeddings</a>.
+ * <p>
+ * The only difference from the standard OpenAI format is an optional
+ * {@code "task"} field in the request body that instructs the Jina model
+ * how to optimise the embedding. Supported values include
+ * {@code retrieval.passage} (default, for indexing documents),
+ * {@code retrieval.query} (for query-time embeddings),
+ * {@code text-matching}, {@code classification}, and {@code separation}.
+ * <p>
+ * Configuration key: {@code "jina-embedding-filter"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "jina-embedding-filter", spi = false)
+public class JinaEmbeddingFilter extends OpenAIEmbeddingFilter {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final ObjectMapper MAPPER = new ObjectMapper();
+
+ /**
+ * Jina task type. Default: {@code retrieval.passage} (for indexing).
+ * Set to {@code retrieval.query} when embedding search queries.
+ */
+ private String task = "retrieval.passage";
+
+ public JinaEmbeddingFilter() {
+ super();
+ }
+
+ public JinaEmbeddingFilter(InferenceConfig config) {
+ super(config);
+ }
+
+ @Override
+ String buildRequest(List<Chunk> chunks, InferenceConfig config) {
+ ObjectNode root = MAPPER.createObjectNode();
+ if (!StringUtils.isBlank(config.getModel())) {
+ root.put("model", config.getModel());
+ }
+ if (!StringUtils.isBlank(task)) {
+ root.put("task", task);
+ }
+ ArrayNode input = root.putArray("input");
+ for (Chunk chunk : chunks) {
+ input.add(chunk.getText());
+ }
+ return root.toString();
+ }
+
+ public String getTask() {
+ return task;
+ }
+
+ /**
+ * Set the Jina task type. Default is {@code retrieval.passage}.
+ * Use {@code retrieval.query} when embedding search queries.
+ */
+ public void setTask(String task) {
+ this.task = task;
+ }
+}
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
index ea771df89e..b186d5465e 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
@@ -177,7 +177,7 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable, Closea
Chunk chunk = new Chunk(null, locators);
chunk.setVector(vector);
- ChunkSerializer.mergeInto(metadata, List.of(chunk));
+ ChunkSerializer.mergeInto(metadata, List.of(chunk), config.getOutputField());
XHTMLContentHandler xhtml = new XHTMLContentHandler(
handler, metadata, parseContext);
@@ -371,6 +371,14 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable, Closea
defaultConfig.setMaxFileSizeToEmbed(maxFileSizeToEmbed);
}
+ public String getOutputField() {
+ return defaultConfig.getOutputField();
+ }
+
+ public void setOutputField(String outputField) {
+ defaultConfig.setOutputField(outputField);
+ }
+
// ---- Azure / endpoint config getters/setters ----------------------------
public String getEmbeddingsPath() {
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java
new file mode 100644
index 0000000000..803eb48523
--- /dev/null
+++ b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/JinaEmbeddingFilterTest.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.inference;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.http.TikaTestHttpServer;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class JinaEmbeddingFilterTest {
+
+ private static final ObjectMapper MAPPER = new ObjectMapper();
+
+ private TikaTestHttpServer server;
+ private JinaEmbeddingFilter filter;
+ private InferenceConfig config;
+
+ @BeforeEach
+ void setUp() throws Exception {
+ server = new TikaTestHttpServer();
+
+ config = new InferenceConfig();
+ config.setBaseUrl(server.url());
+ config.setModel("jina-embeddings-v3");
+ config.setMaxChunkChars(500);
+ config.setOverlapChars(0);
+ config.setTimeoutSeconds(10);
+
+ filter = new JinaEmbeddingFilter(config);
+ }
+
+ @AfterEach
+ void tearDown() {
+ server.shutdown();
+ }
+
+ @Test
+ void testDefaultTaskInRequest() throws Exception {
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(1, 3)));
+
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), "Some document text.");
+ List<Metadata> list = new ArrayList<>();
+ list.add(metadata);
+ filter.filter(list);
+
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ JsonNode body = MAPPER.readTree(request.body());
+ assertEquals("retrieval.passage", body.get("task").asText(),
+ "Default task should be retrieval.passage");
+ assertEquals("jina-embeddings-v3", body.get("model").asText());
+ }
+
+ @Test
+ void testCustomTask() throws Exception {
+ filter.setTask("retrieval.query");
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(1, 3)));
+
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), "What is Tika?");
+ List<Metadata> list = new ArrayList<>();
+ list.add(metadata);
+ filter.filter(list);
+
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ JsonNode body = MAPPER.readTree(request.body());
+ assertEquals("retrieval.query", body.get("task").asText());
+ }
+
+ @Test
+ void testBuildRequestJsonShape() {
+ List<Chunk> chunks = List.of(
+ new Chunk("Hello", 0, 5),
+ new Chunk("World", 6, 11));
+ String json = filter.buildRequest(chunks, config);
+
+ assertTrue(json.contains("\"task\":\"retrieval.passage\""),
+ "Should include task field: " + json);
+ assertTrue(json.contains("\"model\":\"jina-embeddings-v3\""),
+ "Should include model field: " + json);
+ assertTrue(json.contains("\"Hello\""), "Should include first chunk");
+ assertTrue(json.contains("\"World\""), "Should include second chunk");
+ }
+
+ @Test
+ void testEndToEnd() throws Exception {
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(2, 4)));
+
+ String content = "# Introduction\n\nFirst section text.\n\n"
+ + "# Background\n\nSecond section text.";
+
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.TIKA_CONTENT.getName(), content);
+ List<Metadata> list = new ArrayList<>();
+ list.add(metadata);
+ filter.filter(list);
+
+ String chunksJson = metadata.get("tika:chunks");
+ assertNotNull(chunksJson, "Should have tika:chunks");
+
+ List<Chunk> chunks = ChunkSerializer.fromJson(chunksJson);
+ assertEquals(2, chunks.size());
+ assertNotNull(chunks.get(0).getVector());
+ assertNotNull(chunks.get(1).getVector());
+ assertEquals(4, chunks.get(0).getVector().length);
+ }
+
+ private String buildEmbeddingResponse(int numVectors, int dims) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("{\"object\":\"list\",\"data\":[");
+ for (int i = 0; i < numVectors; i++) {
+ if (i > 0) {
+ sb.append(",");
+ }
+ sb.append("{\"object\":\"embedding\",\"index\":").append(i);
+ sb.append(",\"embedding\":[");
+ for (int d = 0; d < dims; d++) {
+ if (d > 0) {
+ sb.append(",");
+ }
+ sb.append(String.format(java.util.Locale.ROOT,
+ "%.6f", (i + 1) * 0.1 + d * 0.01));
+ }
+ sb.append("]}");
+ }
+ sb.append("],\"model\":\"jina-embeddings-v3\",");
+ sb.append("\"usage\":{\"prompt_tokens\":10,\"total_tokens\":10}}");
+ return sb.toString();
+ }
+}