This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4673--add-hooks-for-jina-reader in repository https://gitbox.apache.org/repos/asf/tika.git
commit dad1e966873fd24ebefb368033ec92c05e7bb6ac Author: tballison <[email protected]> AuthorDate: Mon Feb 23 12:17:38 2026 -0500 TIKA-4673 -- add a parser to wrap Jina Reader --- tika-parent/pom.xml | 39 +- tika-parsers/tika-parsers-extended/pom.xml | 4 +- .../tika-parser-jina-reader-module}/pom.xml | 38 +- .../apache/tika/parser/jina/JinaReaderConfig.java | 83 +++++ .../apache/tika/parser/jina/JinaReaderParser.java | 261 +++++++++++++ .../tika/parser/jina/MarkdownToXHTMLEmitter.java | 409 +++++++++++++++++++++ .../tika/parser/jina/JinaReaderParserTest.java | 193 ++++++++++ .../tika-parser-jina-reader-package/pom.xml | 109 ++++++ .../tika-parser-vlm-ocr-module/pom.xml | 27 -- 9 files changed, 1100 insertions(+), 63 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 0ee3afa58e..4758bc3a39 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -387,8 +387,8 @@ (only on the CI, not on local Windows with Docker, see comment in TIKA-4327 on 14.12.2024) expecting org.eclipse.jetty.client.util.InputStreamResponseListener which is only available in Jetty up to 11.0.26 - but this class is now in org.eclipse.jetty.client, see also - https://jetty.org/docs/jetty/12/programming-guide/migration/11-to-12.html + but this class is now in org.eclipse.jetty.client, see also + https://jetty.org/docs/jetty/12/programming-guide/migration/11-to-12.html when updating, see also TODO in PipesBiDirectionalStreamingIntegrationTest and add jakarta.servlet jakarta.servlet-api 6.0.0 to tika-server-core --> @@ -457,6 +457,11 @@ <nimbus-jose-jwt.version>10.8</nimbus-jose-jwt.version> <javacpp.version>1.5.12</javacpp.version> <maven.exec.version>3.6.3</maven.exec.version> + <okhttp.version>4.12.0</okhttp.version> + <!-- kotlin-stdlib-jdk7/jdk8 are Kotlin's own legacy artifact names (shims since Kotlin 1.8); + they are not a JDK target. 
Pinning here resolves enforcer convergence failures when + OkHttp resolves different patch versions of these shims via different transitive paths. --> + <kotlin.version>1.9.10</kotlin.version> <!-- needs to exist even if empty due to problems with jacoco-maven-plugin --> <addmod/> @@ -1144,6 +1149,36 @@ <artifactId>nimbus-jose-jwt</artifactId> <version>${nimbus-jose-jwt.version}</version> </dependency> + <dependency> + <groupId>com.squareup.okhttp3</groupId> + <artifactId>okhttp</artifactId> + <version>${okhttp.version}</version> + </dependency> + <dependency> + <groupId>com.squareup.okhttp3</groupId> + <artifactId>mockwebserver</artifactId> + <version>${okhttp.version}</version> + </dependency> + <dependency> + <groupId>org.jetbrains.kotlin</groupId> + <artifactId>kotlin-stdlib</artifactId> + <version>${kotlin.version}</version> + </dependency> + <dependency> + <groupId>org.jetbrains.kotlin</groupId> + <artifactId>kotlin-stdlib-jdk8</artifactId> + <version>${kotlin.version}</version> + </dependency> + <dependency> + <groupId>org.jetbrains.kotlin</groupId> + <artifactId>kotlin-stdlib-jdk7</artifactId> + <version>${kotlin.version}</version> + </dependency> + <dependency> + <groupId>org.jetbrains.kotlin</groupId> + <artifactId>kotlin-stdlib-common</artifactId> + <version>${kotlin.version}</version> + </dependency> </dependencies> </dependencyManagement> diff --git a/tika-parsers/tika-parsers-extended/pom.xml b/tika-parsers/tika-parsers-extended/pom.xml index 47317aa5db..77a333417e 100644 --- a/tika-parsers/tika-parsers-extended/pom.xml +++ b/tika-parsers/tika-parsers-extended/pom.xml @@ -33,8 +33,10 @@ <modules> <module>tika-parser-sqlite3-module</module> <module>tika-parser-scientific-module</module> + <module>tika-parser-jina-reader-module</module> <module>tika-parser-sqlite3-package</module> <module>tika-parser-scientific-package</module> + <module>tika-parser-jina-reader-package</module> <module>tika-parsers-extended-integration-tests</module> </modules> @@ -93,4 
+95,4 @@ <scm> <tag>3.0.0-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml similarity index 73% copy from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml copy to tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml index 37eb466e9f..0977c6ed6d 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/pom.xml @@ -20,48 +20,20 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>tika-parsers-ml</artifactId> + <artifactId>tika-parsers-extended</artifactId> <groupId>org.apache.tika</groupId> <version>${revision}</version> </parent> + <modelVersion>4.0.0</modelVersion> - <artifactId>tika-parser-vlm-ocr-module</artifactId> - <name>Apache Tika VLM OCR parser module</name> + <artifactId>tika-parser-jina-reader-module</artifactId> + <name>Apache Tika Jina Reader parser module</name> <properties> - <okhttp.version>4.12.0</okhttp.version> <commonmark.version>0.24.0</commonmark.version> </properties> - <dependencyManagement> - <dependencies> - <!-- align Kotlin stdlib versions pulled by OkHttp and Okio --> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk8</artifactId> - <version>1.9.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib</artifactId> - <version>1.9.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk7</artifactId> - <version>1.9.10</version> - </dependency> - <dependency> - 
<groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-common</artifactId> - <version>1.9.10</version> - </dependency> - </dependencies> - </dependencyManagement> - <dependencies> <dependency> <groupId>com.squareup.okhttp3</groupId> @@ -109,7 +81,7 @@ <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.parser.vlm</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.parser.jina.reader</Automatic-Module-Name> </manifestEntries> </archive> </configuration> diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java new file mode 100644 index 0000000000..f8f7717a33 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderConfig.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.jina; + +import java.io.Serializable; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Configuration for {@link JinaReaderParser}. + * <p> + * Sends PDF (base64-encoded) or HTML (raw string) content to the + * <a href="https://jina.ai/reader/">Jina Reader API</a> and receives + * back clean markdown, which is then converted to XHTML. + */ +public class JinaReaderConfig implements Serializable { + + private static final long serialVersionUID = 1L; + + /** Jina Reader API endpoint. */ + private String baseUrl = "https://r.jina.ai/"; + + /** Bearer token for the Jina Reader API. */ + private String apiKey = ""; + + /** HTTP timeout in seconds. Jina Reader is a remote service; default is generous. */ + private int timeoutSeconds = 120; + + /** + * Response format requested from Jina Reader. + * Valid values: {@code markdown}, {@code html}, {@code text}, {@code screenshot}. + * Default is {@code markdown} since we convert it to XHTML. + */ + private String returnFormat = "markdown"; + + // ---- getters / setters ------------------------------------------------ + + public String getBaseUrl() { + return baseUrl; + } + + public void setBaseUrl(String baseUrl) throws TikaConfigException { + this.baseUrl = baseUrl; + } + + public String getApiKey() { + return apiKey; + } + + public void setApiKey(String apiKey) throws TikaConfigException { + this.apiKey = apiKey; + } + + public int getTimeoutSeconds() { + return timeoutSeconds; + } + + public void setTimeoutSeconds(int timeoutSeconds) { + this.timeoutSeconds = timeoutSeconds; + } + + public String getReturnFormat() { + return returnFormat; + } + + public void setReturnFormat(String returnFormat) { + this.returnFormat = returnFormat; + } +} diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java 
b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java new file mode 100644 index 0000000000..12f3efb97b --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/JinaReaderParser.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.jina; + +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Base64; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import okhttp3.MediaType; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.RequestBody; +import okhttp3.Response; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; + +/** + * Parser that sends document content to the + * <a href="https://jina.ai/reader/">Jina Reader API</a> for clean-text + * extraction and returns the result as XHTML. + * <p> + * <b>Supported types:</b> + * <ul> + * <li>{@code application/pdf} — bytes are base64-encoded and sent as + * {@code {"pdf": "<base64>"}}</li> + * <li>{@code text/html} — raw HTML string sent as + * {@code {"html": "<html>..."}}</li> + * </ul> + * <p> + * Authentication: set {@code apiKey} in the config; it is sent as a + * {@code Authorization: Bearer <key>} header. 
+ * <p> + * Configuration key: {@code "jina-reader-parser"} + * + * @since Apache Tika 4.0 + */ +@TikaComponent(name = "jina-reader-parser") +public class JinaReaderParser implements Parser, Initializable, Closeable { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(JinaReaderParser.class); + + private static final MediaType JSON_MEDIA_TYPE = + MediaType.parse("application/json; charset=utf-8"); + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static final Set<org.apache.tika.mime.MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + org.apache.tika.mime.MediaType.application("pdf"), + org.apache.tika.mime.MediaType.text("html") + ))); + + private final JinaReaderConfig config; + private transient OkHttpClient httpClient; + + public JinaReaderParser() { + this(new JinaReaderConfig()); + } + + public JinaReaderParser(JinaReaderConfig config) { + this.config = config; + buildHttpClient(); + } + + public JinaReaderParser(JsonConfig jsonConfig) { + this(ConfigDeserializer.buildConfig(jsonConfig, JinaReaderConfig.class)); + } + + // ---- Parser ----------------------------------------------------------- + + @Override + public Set<org.apache.tika.mime.MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + JinaReaderConfig cfg = context.get(JinaReaderConfig.class, config); + + String contentType = metadata.get(Metadata.CONTENT_TYPE); + boolean isPdf = contentType != null && contentType.startsWith("application/pdf"); + + String requestJson = buildRequestJson(tis, isPdf); + + String markdown = callJinaApi(cfg, requestJson); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + 
xhtml.startElement(XHTML, "div", "div", new org.xml.sax.helpers.AttributesImpl()); + MarkdownToXHTMLEmitter.emit(markdown, xhtml); + xhtml.endElement(XHTML, "div", "div"); + xhtml.endDocument(); + } + + // ---- Initializable ---------------------------------------------------- + + @Override + public void initialize() { + buildHttpClient(); + } + + // ---- Closeable -------------------------------------------------------- + + @Override + public void close() { + if (httpClient != null) { + httpClient.dispatcher().executorService().shutdown(); + httpClient.connectionPool().evictAll(); + } + } + + // ---- helpers ---------------------------------------------------------- + + String buildRequestJson(TikaInputStream tis, boolean isPdf) throws IOException { + ObjectNode root = MAPPER.createObjectNode(); + if (isPdf) { + byte[] bytes = tis.readAllBytes(); + root.put("pdf", Base64.getEncoder().encodeToString(bytes)); + } else { + String html = new String(tis.readAllBytes(), StandardCharsets.UTF_8); + root.put("html", html); + } + return root.toString(); + } + + private String callJinaApi(JinaReaderConfig cfg, String requestJson) throws TikaException { + Request.Builder builder = new Request.Builder() + .url(cfg.getBaseUrl()) + .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE)) + .header("Content-Type", "application/json") + .header("Accept", "application/json") + .header("X-Return-Format", cfg.getReturnFormat()); + + if (!StringUtils.isBlank(cfg.getApiKey())) { + builder.header("Authorization", "Bearer " + cfg.getApiKey()); + } + + Request request = builder.build(); + try (Response response = httpClient.newCall(request).execute()) { + if (!response.isSuccessful()) { + String body = response.body() != null ? response.body().string() : ""; + throw new TikaException( + "Jina Reader API request failed with HTTP " + + response.code() + ": " + body); + } + String responseBody = response.body() != null ? 
response.body().string() : ""; + return extractContent(responseBody); + } catch (IOException e) { + throw new TikaException("Jina Reader API request failed: " + e.getMessage(), e); + } + } + + String extractContent(String responseBody) throws TikaException { + try { + JsonNode root = MAPPER.readTree(responseBody); + JsonNode data = root.get("data"); + if (data == null) { + throw new TikaException( + "Jina Reader API response missing 'data' field: " + responseBody); + } + JsonNode content = data.get("content"); + if (content == null || content.isNull()) { + return ""; + } + return content.asText(); + } catch (IOException e) { + throw new TikaException( + "Failed to parse Jina Reader API response: " + e.getMessage(), e); + } + } + + private void buildHttpClient() { + httpClient = new OkHttpClient.Builder() + .connectTimeout(30, TimeUnit.SECONDS) + .readTimeout(config.getTimeoutSeconds(), TimeUnit.SECONDS) + .writeTimeout(60, TimeUnit.SECONDS) + .build(); + } + + // ---- config getters/setters for XML/JSON config wiring ---------------- + + public String getBaseUrl() { + return config.getBaseUrl(); + } + + public void setBaseUrl(String baseUrl) throws org.apache.tika.exception.TikaConfigException { + config.setBaseUrl(baseUrl); + } + + public String getApiKey() { + return config.getApiKey(); + } + + public void setApiKey(String apiKey) throws org.apache.tika.exception.TikaConfigException { + config.setApiKey(apiKey); + } + + public int getTimeoutSeconds() { + return config.getTimeoutSeconds(); + } + + public void setTimeoutSeconds(int timeoutSeconds) { + config.setTimeoutSeconds(timeoutSeconds); + } + + public String getReturnFormat() { + return config.getReturnFormat(); + } + + public void setReturnFormat(String returnFormat) { + config.setReturnFormat(returnFormat); + } + + // package-visible for tests + JinaReaderConfig getConfig() { + return config; + } +} diff --git 
a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java new file mode 100644 index 0000000000..da5b84a1e5 --- /dev/null +++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/main/java/org/apache/tika/parser/jina/MarkdownToXHTMLEmitter.java @@ -0,0 +1,409 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.jina; + +import java.util.Arrays; +import java.util.List; + +import org.commonmark.Extension; +import org.commonmark.ext.gfm.strikethrough.Strikethrough; +import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; +import org.commonmark.ext.gfm.tables.TableBlock; +import org.commonmark.ext.gfm.tables.TableBody; +import org.commonmark.ext.gfm.tables.TableCell; +import org.commonmark.ext.gfm.tables.TableHead; +import org.commonmark.ext.gfm.tables.TableRow; +import org.commonmark.ext.gfm.tables.TablesExtension; +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.BlockQuote; +import org.commonmark.node.BulletList; +import org.commonmark.node.Code; +import org.commonmark.node.Document; +import org.commonmark.node.Emphasis; +import org.commonmark.node.FencedCodeBlock; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Heading; +import org.commonmark.node.HtmlBlock; +import org.commonmark.node.HtmlInline; +import org.commonmark.node.Image; +import org.commonmark.node.IndentedCodeBlock; +import org.commonmark.node.Link; +import org.commonmark.node.ListItem; +import org.commonmark.node.Node; +import org.commonmark.node.OrderedList; +import org.commonmark.node.Paragraph; +import org.commonmark.node.SoftLineBreak; +import org.commonmark.node.StrongEmphasis; +import org.commonmark.node.Text; +import org.commonmark.node.ThematicBreak; +import org.commonmark.parser.Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Parses a markdown string using commonmark-java and emits XHTML SAX events. 
+ * <p> + * Supports: + * <ul> + * <li>Headings ({@code h1}–{@code h6})</li> + * <li>Paragraphs ({@code p})</li> + * <li>Bold / italic / strikethrough ({@code b}, {@code i}, {@code s})</li> + * <li>Links ({@code a}) and images ({@code img})</li> + * <li>Ordered and unordered lists ({@code ol}, {@code ul}, {@code li})</li> + * <li>Blockquotes ({@code blockquote})</li> + * <li>Code blocks ({@code pre}/{@code code}) and inline code ({@code code})</li> + * <li>GFM tables ({@code table}, {@code thead}, {@code tbody}, {@code tr}, + * {@code th}, {@code td})</li> + * <li>Thematic breaks ({@code hr})</li> + * <li>Hard / soft line breaks ({@code br})</li> + * </ul> + * + * @since Apache Tika 4.0 + */ +class MarkdownToXHTMLEmitter { + + private static final List<Extension> EXTENSIONS = Arrays.asList( + TablesExtension.create(), + StrikethroughExtension.create() + ); + + private static final Parser PARSER = Parser.builder() + .extensions(EXTENSIONS) + .build(); + + private static final AttributesImpl EMPTY_ATTRS = new AttributesImpl(); + + /** + * Parses the given markdown text and emits SAX events to the handler. + * <p> + * The caller is responsible for calling {@code startDocument} / + * {@code endDocument} on the handler if desired — this method only emits + * the body-level elements. + * + * @param markdown the markdown text to parse + * @param handler the SAX content handler to receive events + * @throws SAXException if the handler throws + */ + static void emit(String markdown, ContentHandler handler) throws SAXException { + if (markdown == null || markdown.isEmpty()) { + return; + } + Node document = PARSER.parse(markdown); + SAXVisitor visitor = new SAXVisitor(handler); + document.accept(visitor); + if (visitor.saxException != null) { + throw visitor.saxException; + } + } + + /** + * commonmark AST visitor that fires SAX events for each node. 
+ */ + private static class SAXVisitor extends AbstractVisitor { + + private final ContentHandler handler; + SAXException saxException; + + SAXVisitor(ContentHandler handler) { + this.handler = handler; + } + + // --- block nodes --- + + @Override + public void visit(Document document) { + visitChildren(document); + } + + @Override + public void visit(Heading heading) { + String tag = "h" + heading.getLevel(); + startElement(tag); + visitChildren(heading); + endElement(tag); + } + + @Override + public void visit(Paragraph paragraph) { + // Skip wrapping <p> inside list items — commonmark wraps + // "loose" list item content in Paragraph nodes, which would + // produce <li><p>text</p></li>. We emit the text directly. + if (paragraph.getParent() instanceof ListItem) { + visitChildren(paragraph); + return; + } + startElement("p"); + visitChildren(paragraph); + endElement("p"); + } + + @Override + public void visit(BlockQuote blockQuote) { + startElement("blockquote"); + visitChildren(blockQuote); + endElement("blockquote"); + } + + @Override + public void visit(BulletList bulletList) { + startElement("ul"); + visitChildren(bulletList); + endElement("ul"); + } + + @Override + public void visit(OrderedList orderedList) { + startElement("ol"); + visitChildren(orderedList); + endElement("ol"); + } + + @Override + public void visit(ListItem listItem) { + startElement("li"); + visitChildren(listItem); + endElement("li"); + } + + @Override + public void visit(FencedCodeBlock fencedCodeBlock) { + AttributesImpl attrs = EMPTY_ATTRS; + String info = fencedCodeBlock.getInfo(); + if (info != null && !info.isEmpty()) { + attrs = new AttributesImpl(); + attrs.addAttribute("", "class", "class", "CDATA", + "language-" + info.split("\\s+")[0]); + } + startElement("pre"); + startElement("code", attrs); + characters(fencedCodeBlock.getLiteral()); + endElement("code"); + endElement("pre"); + } + + @Override + public void visit(IndentedCodeBlock indentedCodeBlock) { + startElement("pre"); 
+ startElement("code"); + characters(indentedCodeBlock.getLiteral()); + endElement("code"); + endElement("pre"); + } + + @Override + public void visit(ThematicBreak thematicBreak) { + emptyElement("hr"); + } + + @Override + public void visit(HtmlBlock htmlBlock) { + // Emit raw HTML content as plain text — we don't parse nested HTML + characters(htmlBlock.getLiteral()); + } + + // --- inline nodes --- + + @Override + public void visit(Text text) { + characters(text.getLiteral()); + } + + @Override + public void visit(StrongEmphasis strongEmphasis) { + startElement("b"); + visitChildren(strongEmphasis); + endElement("b"); + } + + @Override + public void visit(Emphasis emphasis) { + startElement("i"); + visitChildren(emphasis); + endElement("i"); + } + + @Override + public void visit(Code code) { + startElement("code"); + characters(code.getLiteral()); + endElement("code"); + } + + @Override + public void visit(Link link) { + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "href", "href", "CDATA", link.getDestination()); + if (link.getTitle() != null && !link.getTitle().isEmpty()) { + attrs.addAttribute("", "title", "title", "CDATA", link.getTitle()); + } + startElement("a", attrs); + visitChildren(link); + endElement("a"); + } + + @Override + public void visit(Image image) { + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "src", "src", "CDATA", image.getDestination()); + if (image.getTitle() != null && !image.getTitle().isEmpty()) { + attrs.addAttribute("", "title", "title", "CDATA", image.getTitle()); + } + // Use alt text from child text nodes + StringBuilder alt = new StringBuilder(); + Node child = image.getFirstChild(); + while (child != null) { + if (child instanceof Text) { + alt.append(((Text) child).getLiteral()); + } + child = child.getNext(); + } + attrs.addAttribute("", "alt", "alt", "CDATA", alt.toString()); + emptyElement("img", attrs); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + 
emptyElement("br"); + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + characters(" "); + } + + @Override + public void visit(HtmlInline htmlInline) { + // Emit inline HTML as plain text + characters(htmlInline.getLiteral()); + } + + // --- GFM extensions --- + + @Override + public void visit(org.commonmark.node.CustomBlock customBlock) { + if (customBlock instanceof TableBlock) { + startElement("table"); + visitChildren(customBlock); + endElement("table"); + } else { + visitChildren(customBlock); + } + } + + @Override + public void visit(org.commonmark.node.CustomNode customNode) { + if (customNode instanceof TableHead) { + startElement("thead"); + visitChildren(customNode); + endElement("thead"); + } else if (customNode instanceof TableBody) { + startElement("tbody"); + visitChildren(customNode); + endElement("tbody"); + } else if (customNode instanceof TableRow) { + startElement("tr"); + visitChildren(customNode); + endElement("tr"); + } else if (customNode instanceof TableCell) { + TableCell cell = (TableCell) customNode; + String tag = cell.isHeader() ? 
"th" : "td"; + AttributesImpl attrs = EMPTY_ATTRS; + TableCell.Alignment alignment = cell.getAlignment(); + if (alignment != null) { + attrs = new AttributesImpl(); + String align; + switch (alignment) { + case LEFT: + align = "left"; + break; + case CENTER: + align = "center"; + break; + case RIGHT: + align = "right"; + break; + default: + align = null; + break; + } + if (align != null) { + attrs.addAttribute("", "align", "align", "CDATA", align); + } + } + startElement(tag, attrs); + visitChildren(customNode); + endElement(tag); + } else if (customNode instanceof Strikethrough) { + startElement("s"); + visitChildren(customNode); + endElement("s"); + } else { + visitChildren(customNode); + } + } + + // --- SAX helpers --- + + private void startElement(String localName) { + startElement(localName, EMPTY_ATTRS); + } + + private void startElement(String localName, AttributesImpl attrs) { + if (saxException != null) { + return; + } + try { + handler.startElement("", localName, localName, attrs); + } catch (SAXException e) { + saxException = e; + } + } + + private void endElement(String localName) { + if (saxException != null) { + return; + } + try { + handler.endElement("", localName, localName); + } catch (SAXException e) { + saxException = e; + } + } + + private void emptyElement(String localName) { + emptyElement(localName, EMPTY_ATTRS); + } + + private void emptyElement(String localName, AttributesImpl attrs) { + startElement(localName, attrs); + endElement(localName); + } + + private void characters(String text) { + if (saxException != null || text == null || text.isEmpty()) { + return; + } + try { + char[] chars = text.toCharArray(); + handler.characters(chars, 0, chars.length); + } catch (SAXException e) { + saxException = e; + } + } + } +} diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/test/java/org/apache/tika/parser/jina/JinaReaderParserTest.java 
b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/test/java/org/apache/tika/parser/jina/JinaReaderParserTest.java
new file mode 100644
index 0000000000..43d6ded190
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-module/src/test/java/org/apache/tika/parser/jina/JinaReaderParserTest.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jina;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Base64;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.mockwebserver.MockResponse;
+import okhttp3.mockwebserver.MockWebServer;
+import okhttp3.mockwebserver.RecordedRequest;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Exercises {@link JinaReaderParser} against a local {@link MockWebServer}
+ * whose URL is configured as the parser's base URL.
+ */
+public class JinaReaderParserTest {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private MockWebServer server;
+    private JinaReaderParser parser;
+    private JinaReaderConfig config;
+
+    /** Starts a fresh mock server and builds a parser configured to call it. */
+    @BeforeEach
+    void setUp() throws Exception {
+        server = new MockWebServer();
+        server.start();
+
+        config = new JinaReaderConfig();
+        config.setBaseUrl(server.url("/").toString());
+        config.setApiKey("test-key");
+        config.setTimeoutSeconds(10);
+
+        parser = new JinaReaderParser(config);
+    }
+
+    /** Closes the parser, then shuts the server down even if that close throws. */
+    @AfterEach
+    void tearDown() throws Exception {
+        try {
+            parser.close();
+        } finally {
+            server.shutdown();
+        }
+    }
+
+    /** A PDF is POSTed base64-encoded in the "pdf" field with the expected headers. */
+    @Test
+    void testPdfParsing() throws Exception {
+        String markdown = "# My PDF Title\n\nSome paragraph text.";
+        server.enqueue(new MockResponse()
+                .setBody(buildJinaResponse(markdown))
+                .setHeader("Content-Type", "application/json"));
+
+        byte[] fakePdf = "%PDF-1.4 fake pdf content".getBytes(StandardCharsets.UTF_8);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+
+        BodyContentHandler handler = new BodyContentHandler();
+        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(fakePdf))) {
+            parser.parse(tis, handler, metadata, new ParseContext());
+        }
+
+        assertTrue(handler.toString().contains("My PDF Title"));
+        assertTrue(handler.toString().contains("Some paragraph text."));
+
+        RecordedRequest request = server.takeRequest();
+        assertEquals("POST", request.getMethod());
+        assertEquals("Bearer test-key", request.getHeader("Authorization"));
+        assertEquals("markdown", request.getHeader("X-Return-Format"));
+
+        JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+        assertTrue(body.has("pdf"), "Request should have 'pdf' field");
+        String decoded = new String(Base64.getDecoder().decode(body.get("pdf").asText()),
+                StandardCharsets.UTF_8);
+        assertTrue(decoded.startsWith("%PDF"));
+    }
+
+    /** HTML input is sent as text in the "html" field of the request body. */
+    @Test
+    void testHtmlParsing() throws Exception {
+        String markdown = "## Article Heading\n\nClean content here.";
+        server.enqueue(new MockResponse()
+                .setBody(buildJinaResponse(markdown))
+                .setHeader("Content-Type", "application/json"));
+
+        String html = "<html><body><nav>skip</nav><article><h1>Article</h1>"
+                + "<p>Content</p></article></body></html>";
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+        BodyContentHandler handler = new BodyContentHandler();
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)))) {
+            parser.parse(tis, handler, metadata, new ParseContext());
+        }
+
+        assertTrue(handler.toString().contains("Article Heading"));
+
+        RecordedRequest request = server.takeRequest();
+        JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+        assertTrue(body.has("html"), "Request should have 'html' field");
+        assertTrue(body.get("html").asText().contains("<html>"));
+    }
+
+    /** An HTTP 400 from the service surfaces as a {@link TikaException}. */
+    @Test
+    void testApiError() throws Exception {
+        server.enqueue(new MockResponse()
+                .setResponseCode(400)
+                .setBody("{\"error\":\"No URL provided\"}"));
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+
+        assertThrows(TikaException.class, () -> {
+            try (TikaInputStream tis = TikaInputStream.get(
+                    new ByteArrayInputStream(new byte[]{1, 2, 3}))) {
+                parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
+            }
+        });
+    }
+
+    /** The markdown is read back unchanged from data.content of the response. */
+    @Test
+    void testExtractContent() throws TikaException {
+        String response = buildJinaResponse("Hello **world**");
+        String content = parser.extractContent(response);
+        assertEquals("Hello **world**", content);
+    }
+
+    /** A response with no "data" member is rejected with a {@link TikaException}. */
+    @Test
+    void testExtractContentMissingData() {
+        assertThrows(TikaException.class,
+                () -> parser.extractContent("{\"code\":200}"));
+    }
+
+    /** Both application/pdf and text/html are advertised as supported types. */
+    @Test
+    void testSupportedTypes() {
+        Set<org.apache.tika.mime.MediaType> types =
+                parser.getSupportedTypes(new ParseContext());
+        assertTrue(types.stream().anyMatch(mt -> mt.toString().equals("application/pdf")));
+        assertTrue(types.stream().anyMatch(mt -> mt.toString().equals("text/html")));
+    }
+
+    /** With a blank API key the request carries no Authorization header. */
+    @Test
+    void testNoApiKeyHeader() throws Exception {
+        config.setApiKey("");
+        // tearDown() only closes whatever "parser" refers to at the end of the test,
+        // so the instance built in setUp() has to be closed here before it is replaced.
+        JinaReaderParser keyless = new JinaReaderParser(config);
+        parser.close();
+        parser = keyless;
+
+        server.enqueue(new MockResponse()
+                .setBody(buildJinaResponse("content"))
+                .setHeader("Content-Type", "application/json"));
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream("<html/>".getBytes(StandardCharsets.UTF_8)))) {
+            parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
+        }
+
+        RecordedRequest request = server.takeRequest();
+        assertTrue(request.getHeader("Authorization") == null
+                        || request.getHeader("Authorization").isEmpty(),
+                "No auth header expected when apiKey is blank");
+    }
+
+    /**
+     * Builds a minimal response of the form {"code":200,"data":{"content":...}}.
+     * Serialising through Jackson escapes every character that JSON requires.
+     */
+    private String buildJinaResponse(String content) {
+        return MAPPER.createObjectNode()
+                .put("code", 200)
+                .set("data", MAPPER.createObjectNode().put("content", content))
+                .toString();
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-package/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-package/pom.xml
new file mode 100644
index 0000000000..c27cb7deb5
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-jina-reader-package/pom.xml
@@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements. See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership. The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License. You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied. See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parsers-extended</artifactId>
+    <version>${revision}</version>
+  </parent>
+
+  <artifactId>tika-parser-jina-reader-package</artifactId>
+  <name>Apache Tika Jina Reader parser package</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-jina-reader-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifestEntries>
+              <Automatic-Module-Name>org.apache.tika.parser.jina.reader</Automatic-Module-Name>
+            </manifestEntries>
+          </archive>
+        </configuration>
+      </plugin>
+      <!-- Bundle the module and its dependencies into a single jar during the package phase. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>${maven.shade.version}</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals><goal>shade</goal></goals>
+            <configuration>
+              <createDependencyReducedPom>false</createDependencyReducedPom>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>module-info.class</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                    <exclude>META-INF/DEPENDENCIES</exclude>
+                    <exclude>META-INF/MANIFEST.MF</exclude>
+                    <exclude>META-INF/LICENSE.md</exclude>
+                    <exclude>META-INF/NOTICE.md</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
+                  <addHeader>false</addHeader>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer" />
+                <!-- Merge META-INF/services files so providers from every bundled jar stay registered. -->
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/LICENSE</resource>
+                  <file>target/classes/META-INF/LICENSE</file>
+                </transformer>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <inputExcludes>
+            <inputExclude>dependency-reduced-pom.xml</inputExclude>
+          </inputExcludes>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <scm>
+    <tag>3.0.0-rc1</tag>
+  </scm>
+</project>
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml index 37eb466e9f..c21da92c12 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml @@ -32,36 +32,9 @@ <name>Apache Tika VLM OCR parser module</name> <properties> - <okhttp.version>4.12.0</okhttp.version> <commonmark.version>0.24.0</commonmark.version> </properties> - <dependencyManagement> - <dependencies> - <!-- align Kotlin stdlib versions pulled by OkHttp and Okio --> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk8</artifactId> - <version>1.9.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib</artifactId> - <version>1.9.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk7</artifactId> - <version>1.9.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-common</artifactId> - <version>1.9.10</version> - </dependency> -
</dependencies> - </dependencyManagement> - <dependencies> <dependency> <groupId>com.squareup.okhttp3</groupId>
