This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4653-markdown-handler in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1bc86c4699cdf8688e4f94cbd9ad13ea10fefd41 Author: tallison <[email protected]> AuthorDate: Mon Feb 9 17:00:13 2026 -0500 TIKA-4656 and clean up md integration --- docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/pipes/index.adoc | 1 + docs/modules/ROOT/pages/pipes/parse-modes.adoc | 139 +++++++++++++++++++++ docs/modules/ROOT/pages/using-tika/cli/index.adoc | 75 +++++++++++ .../ROOT/pages/using-tika/java-api/index.adoc | 14 ++- .../ROOT/pages/using-tika/server/index.adoc | 54 ++++++++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 10 ++ .../org/apache/tika/async/cli/PluginsWriter.java | 31 +++++ .../apache/tika/async/cli/SimpleAsyncConfig.java | 19 ++- .../org/apache/tika/async/cli/TikaAsyncCLI.java | 14 ++- .../java/org/apache/tika/pipes/api/ParseMode.java | 16 ++- .../apache/tika/pipes/core/server/EmitHandler.java | 36 +++++- .../tika/pipes/core/server/ParseHandler.java | 5 + .../apache/tika/pipes/core/PipesClientTest.java | 86 +++++++++++++ .../tika/pipes/emitter/fs/FileSystemEmitter.java | 2 +- .../tika/server/core/resource/TikaResource.java | 7 +- 16 files changed, 499 insertions(+), 11 deletions(-) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 89ea2c0c8c..eaeddf383f 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -21,6 +21,7 @@ ** xref:using-tika/cli/index.adoc[Command Line] ** xref:using-tika/grpc/index.adoc[gRPC] * xref:pipes/index.adoc[Pipes] +** xref:pipes/parse-modes.adoc[Parse Modes] ** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] diff --git a/docs/modules/ROOT/pages/pipes/index.adoc b/docs/modules/ROOT/pages/pipes/index.adoc index 899a427514..ff67ab6e0c 100644 --- a/docs/modules/ROOT/pages/pipes/index.adoc +++ b/docs/modules/ROOT/pages/pipes/index.adoc @@ -29,6 +29,7 @@ Tika Pipes provides a framework for processing large volumes of documents with: == Topics +* xref:pipes/parse-modes.adoc[Parse Modes] - Control how documents are parsed and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `UNPACK`) * xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] - Extract raw bytes from embedded documents using `ParseMode.UNPACK` // Add links to specific topics as they are created diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc new file mode 100644 index 0000000000..a023d0b406 --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc @@ -0,0 +1,139 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Parse Modes + +Tika Pipes uses `ParseMode` to control how documents are parsed and how results are emitted. +The parse mode is set on the `ParseContext` or configured in `PipesConfig`. + +== Available Parse Modes + +[cols="1,3"] +|=== +|Mode |Description + +|`RMETA` +|Default mode. Each embedded document produces a separate `Metadata` object. +Results are returned as a JSON array of metadata objects. + +|`CONCATENATE` +|All content from embedded documents is concatenated into a single content field. +Results are returned as a single `Metadata` object with all metadata preserved. + +|`CONTENT_ONLY` +|Parses like `CONCATENATE` but emits only the raw extracted content — no JSON wrapper, +no metadata fields. Useful when you want just the text, markdown, or HTML output. + +|`NO_PARSE` +|Skip parsing entirely. Useful for pipelines that only need to fetch and emit raw bytes. + +|`UNPACK` +|Extract raw bytes from embedded documents. See xref:pipes/unpack-config.adoc[Extracting Embedded Bytes]. +|=== + +== CONCATENATE Mode + +`CONCATENATE` merges all content from embedded documents into a single content field +while preserving all metadata from parsing: + +[source,json] +---- +{ + "parseContext": { + "parseMode": "CONCATENATE" + } +} +---- + +The result is a single `Metadata` object containing the concatenated content in +`X-TIKA:content` along with all other metadata fields (title, author, content type, etc.). + +== CONTENT_ONLY Mode + +`CONTENT_ONLY` is designed for use cases where you want just the extracted content +written to storage — no JSON wrapping, no metadata overhead. This is particularly +useful for: + +* Extracting markdown files from a document corpus +* Building plain text search indexes +* Generating HTML versions of documents + +[source,json] +---- +{ + "parseContext": { + "parseMode": "CONTENT_ONLY" + } +} +---- + +=== How It Works + +1. Documents are parsed identically to `CONCATENATE` mode — all embedded content is + merged into a single content field. +2. A metadata filter automatically strips all metadata except `X-TIKA:content` and + `X-TIKA:CONTAINER_EXCEPTION` (for error tracking). +3. When the emitter is a `StreamEmitter` (such as the filesystem or S3 emitter), the + raw content string is written directly as bytes — no JSON serialization. + +=== Metadata Filtering + +By default, `CONTENT_ONLY` mode applies an `IncludeFieldMetadataFilter` that retains +only `X-TIKA:content` and `X-TIKA:CONTAINER_EXCEPTION`. If you set your own +`MetadataFilter` on the `ParseContext`, your filter takes priority. + +=== CLI Usage + +The `tika-async-cli` batch processor supports `CONTENT_ONLY` via the `--content-only` +flag: + +[source,bash] +---- +java -jar tika-async-cli.jar -i /input -o /output -h m --content-only +---- + +This produces `.md` files (when using the `m` handler type) containing only the +extracted markdown content. + +=== Content Handler Types + +The content format depends on the configured handler type: + +[cols="1,1,2"] +|=== +|Handler |Extension |Description + +|`t` (text) +|`.txt` +|Plain text output + +|`h` (html) +|`.html` +|HTML output + +|`x` (xml) +|`.xml` +|XHTML output + +|`m` (markdown) +|`.md` +|Markdown output + +|`b` (body) +|`.txt` +|Body content handler output +|=== diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc index 0e19ed6170..17a631e1f8 100644 --- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc @@ -83,6 +83,9 @@ java -jar tika-app.jar [option...] [file|port...] |`-t` or `--text` |Output plain text +|`--md` +|Output Markdown + |`-m` or `--metadata` |Output metadata only @@ -124,6 +127,13 @@ Process entire directories by specifying input and output paths: java -jar tika-app.jar -i /path/to/input -o /path/to/output ---- +=== Extract Markdown from a file + +[source,bash] +---- +java -jar tika-app.jar --md document.docx +---- + === Custom configuration Use a custom configuration file: @@ -132,3 +142,68 @@ Use a custom configuration file: ---- java -jar tika-app.jar --config=tika-config.json document.pdf ---- + +== Batch Processing (tika-async-cli) + +For processing large numbers of files, use `tika-async-cli`. It uses the Tika Pipes +architecture with forked JVM processes for fault tolerance. + +=== Basic Batch Usage + +[source,bash] +---- +java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output +---- + +This processes all files in the input directory and writes JSON metadata (RMETA format) +to the output directory. + +=== Batch Options + +[cols="1,3"] +|=== +|Option |Description + +|`-i` +|Input directory + +|`-o` +|Output directory + +|`-h` or `--handlerType` +|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore (default: `t`) + +|`--concatenate` +|Concatenate content from all embedded documents into a single content field + +|`--content-only` +|Output only extracted content (no metadata, no JSON wrapper); implies `--concatenate` + +|`-T` or `--timeoutMs` +|Timeout for each parse in milliseconds + +|`-n` or `--numClients` +|Number of parallel forked processes + +|`-p` or `--pluginsDir` +|Plugins directory +|=== + +=== Batch Examples + +Extract markdown content only (no metadata) from all files: + +[source,bash] +---- +java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output -h m --content-only +---- + +This produces `.md` files in the output directory containing just the extracted markdown +content — no JSON wrappers, no metadata fields. + +Extract text with all metadata in concatenated mode: + +[source,bash] +---- +java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output --concatenate +---- diff --git a/docs/modules/ROOT/pages/using-tika/java-api/index.adoc b/docs/modules/ROOT/pages/using-tika/java-api/index.adoc index 5b2ff99930..4853446d50 100644 --- a/docs/modules/ROOT/pages/using-tika/java-api/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/java-api/index.adoc @@ -100,12 +100,24 @@ For example, use `TikaInputStream.get(path)` for a `Path`, or `TikaInputStream.g for a `byte[]`. This allows Tika to access the underlying resource efficiently and enables features like mark/reset support that many parsers and detectors require. -=== Utility Classes +=== Content Handlers + +Tika provides several content handlers that control the output format: **BodyContentHandler**:: Extracts and converts the body content to streams or strings. +**ToTextContentHandler**:: Outputs plain text. + +**ToHTMLContentHandler**:: Outputs HTML. + +**ToXMLContentHandler**:: Outputs XHTML/XML. + +**ToMarkdownContentHandler**:: Outputs Markdown, preserving structural semantics like headings, lists, tables, code blocks, emphasis, and links. + **ParsingReader**:: Uses background threading to return extracted text as character streams. +Use `BasicContentHandlerFactory` to create handlers by type: `TEXT`, `HTML`, `XML`, `BODY`, `MARKDOWN`, `IGNORE`. + === Key Metadata Properties * `TikaCoreProperties.RESOURCE_NAME_KEY` - filename or resource identifier diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc b/docs/modules/ROOT/pages/using-tika/server/index.adoc index dbb086b5e6..1eee6a448f 100644 --- a/docs/modules/ROOT/pages/using-tika/server/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc @@ -33,6 +33,60 @@ java -jar tika-server-standard.jar The server starts on port 9998 by default. +== Endpoints + +=== Content Extraction (`/tika`) + +The `/tika` endpoint extracts content from a document as plain text. + +[source,bash] +---- +curl -T document.pdf http://localhost:9998/tika +---- + +==== Markdown Output (`/tika/md`) + +The `/tika/md` endpoint extracts content as Markdown, preserving structural semantics +like headings, lists, tables, and emphasis: + +[source,bash] +---- +curl -T document.docx http://localhost:9998/tika/md +---- + +==== Custom Handler Type + +Use the `X-Tika-Handler` header to control the output format. Valid values: `text` (default), +`html`, `xml`, `markdown`, `ignore`. + +[source,bash] +---- +curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika +---- + +=== Recursive Metadata (`/rmeta`) + +The `/rmeta` endpoint returns metadata for the container document and all embedded documents +as a JSON array of metadata objects. + +[source,bash] +---- +curl -T document.pdf http://localhost:9998/rmeta +---- + +Content handler can be specified in the URL path: + +* `/rmeta/text` - plain text content (default) +* `/rmeta/html` - HTML content +* `/rmeta/xml` - XHTML content +* `/rmeta/markdown` - Markdown content +* `/rmeta/ignore` - metadata only, no content + +[source,bash] +---- +curl -T document.docx http://localhost:9998/rmeta/markdown +---- + == Topics * xref:using-tika/server/tls.adoc[TLS/SSL Configuration] - Secure your server with TLS and mutual authentication diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index dfcc299520..002c92acf0 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -99,6 +99,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.ExpandedTitleContentHandler; +import org.apache.tika.sax.ToMarkdownContentHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; @@ -225,6 +226,12 @@ public class TikaCLI { * Fork mode plugins directory. */ private String forkPluginsDir = null; + private final OutputType MARKDOWN = new OutputType() { + @Override + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + return new BodyContentHandler(new ToMarkdownContentHandler(getOutputWriter(output, encoding))); + } + }; private final OutputType XML = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { @@ -483,6 +490,8 @@ public class TikaCLI { type = XML; } else if (arg.equals("-h") || arg.equals("--html")) { type = HTML; + } else if (arg.equals("--md")) { + type = MARKDOWN; } else if (arg.equals("-t") || arg.equals("--text")) { type = TEXT; } else if (arg.equals("-T") || arg.equals("--text-main")) { @@ -744,6 +753,7 @@ public class TikaCLI { out.println(" -x or --xml Output XHTML content (default)"); out.println(" -h or --html Output HTML content"); out.println(" -t or --text Output plain text content (body)"); + out.println(" --md Output Markdown content (body)"); out.println(" -T or --text-main Output plain text content (main content only via boilerpipe handler)"); out.println(" -A or --text-all Output all text content"); out.println(" -m or --metadata Output only metadata"); diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java index 1257c48e4c..dd2b02d067 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java @@ -27,7 +27,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.tika.config.loader.TikaObjectMapperFactory; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.core.PipesConfig; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.utils.StringUtils; public class PluginsWriter { @@ -101,6 +103,24 @@ public class PluginsWriter { if (simpleAsyncConfig.getTimeoutMs() != null) { pipesConfig.setTimeoutMillis(simpleAsyncConfig.getTimeoutMs()); } + if (simpleAsyncConfig.isContentOnly()) { + pipesConfig.setParseMode(ParseMode.CONTENT_ONLY); + } else if (simpleAsyncConfig.isConcatenate()) { + pipesConfig.setParseMode(ParseMode.CONCATENATE); + } + + // For content-only mode, change the emitter file extension based on handler type + if (simpleAsyncConfig.isContentOnly()) { + String ext = getFileExtensionForHandlerType(simpleAsyncConfig.getHandlerType()); + if (emitters != null && emitters.has("fse")) { + ObjectNode fse = (ObjectNode) emitters.get("fse"); + if (fse != null && fse.has("file-system-emitter")) { + ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter"); + fsEmitter.put("fileExtension", ext); + } + } + } + root.set("pipes", objectMapper.valueToTree(pipesConfig)); objectMapper.writerWithDefaultPrettyPrinter().writeValue(output.toFile(), root); @@ -108,4 +128,15 @@ public class PluginsWriter { throw new IOException(e); } } + + private static String getFileExtensionForHandlerType( + BasicContentHandlerFactory.HANDLER_TYPE handlerType) { + return switch (handlerType) { + case MARKDOWN -> "md"; + case HTML -> "html"; + case XML -> "xml"; + case BODY, TEXT -> "txt"; + default -> "txt"; + }; + } } diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java index 0abefc9267..5ea5e764ba 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java @@ -37,6 +37,10 @@ class SimpleAsyncConfig { private final BasicContentHandlerFactory.HANDLER_TYPE handlerType; private final String pluginsDir; + // Parse mode options + private final boolean concatenate; + private final boolean contentOnly; + // Frictionless Data Package options private final String unpackFormat; // "REGULAR" or "FRICTIONLESS" private final String unpackMode; // "ZIPPED" or "DIRECTORY" @@ -47,12 +51,13 @@ class SimpleAsyncConfig { String tikaConfig, BasicContentHandlerFactory.HANDLER_TYPE handlerType, ExtractBytesMode extractBytesMode, String pluginsDir) { this(inputDir, outputDir, numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType, - extractBytesMode, pluginsDir, null, null, false); + extractBytesMode, pluginsDir, false, false, null, null, false); } public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, String tikaConfig, BasicContentHandlerFactory.HANDLER_TYPE handlerType, ExtractBytesMode extractBytesMode, String pluginsDir, + boolean concatenate, boolean contentOnly, String unpackFormat, String unpackMode, boolean unpackIncludeMetadata) { this.inputDir = inputDir; this.outputDir = outputDir; @@ -64,6 +69,8 @@ class SimpleAsyncConfig { this.handlerType = handlerType; this.extractBytesMode = extractBytesMode; this.pluginsDir = pluginsDir; + this.concatenate = concatenate; + this.contentOnly = contentOnly; this.unpackFormat = unpackFormat; this.unpackMode = unpackMode; this.unpackIncludeMetadata = unpackIncludeMetadata; @@ -109,6 +116,14 @@ class SimpleAsyncConfig { return pluginsDir; } + public boolean isConcatenate() { + return concatenate; + } + + public boolean isContentOnly() { + return contentOnly; + } + public String getUnpackFormat() { return unpackFormat; } @@ -134,6 +149,8 @@ class SimpleAsyncConfig { ", extractBytesMode=" + extractBytesMode + ", handlerType=" + handlerType + ", pluginsDir='" + pluginsDir + '\'' + + ", concatenate=" + concatenate + + ", contentOnly=" + contentOnly + ", unpackFormat='" + unpackFormat + '\'' + ", unpackMode='" + unpackMode + '\'' + ", unpackIncludeMetadata=" + unpackIncludeMetadata + diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 4687845389..a28e0b26f9 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -67,12 +67,14 @@ public class TikaAsyncCLI { options.addOption("X", "Xmx", true, "heap for the forked clients in usual jvm heap amount, e.g. -X 1g"); options.addOption("?", "help", false, "this help message"); options.addOption("T", "timeoutMs", true, "timeout for each parse in milliseconds"); - options.addOption("h", "handlerType", true, "handler type: t=text, h=html, x=xml, b=body, i=ignore"); + options.addOption("h", "handlerType", true, "handler type: t=text, h=html, x=xml, m=markdown, b=body, i=ignore"); options.addOption("p", "pluginsDir", true, "plugins directory"); //options.addOption("l", "fileList", true, "file list"); options.addOption("c", "config", true, "tikaConfig.json"); options.addOption("z", "unzipShallow", false, "extract raw bytes from direct attachments only (depth=1)"); options.addOption("Z", "unzipRecursive", false, "extract raw bytes from all attachments recursively"); + options.addOption(null, "concatenate", false, "concatenate content from all embedded documents into a single content field"); + options.addOption(null, "content-only", false, "output only extracted content (no metadata, no JSON wrapper); implies --concatenate"); options.addOption(null, "unpack-format", true, "output format for unpacking: REGULAR (default) or FRICTIONLESS"); options.addOption(null, "unpack-mode", true, @@ -224,6 +226,10 @@ public class TikaAsyncCLI { pluginsDir = line.getOptionValue('p'); } + // Parse mode options + boolean contentOnly = line.hasOption("content-only"); + boolean concatenate = line.hasOption("concatenate") || contentOnly; + // Frictionless Data Package options String unpackFormat = null; String unpackMode = null; @@ -282,17 +288,19 @@ public class TikaAsyncCLI { return new SimpleAsyncConfig(inputDir, outputDir, numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType, - extractBytesMode, pluginsDir, unpackFormat, unpackMode, unpackIncludeMetadata); + extractBytesMode, pluginsDir, concatenate, contentOnly, + unpackFormat, unpackMode, unpackIncludeMetadata); } private static BasicContentHandlerFactory.HANDLER_TYPE getHandlerType(String t) throws TikaConfigException { return switch (t) { case "x" -> BasicContentHandlerFactory.HANDLER_TYPE.XML; case "h" -> BasicContentHandlerFactory.HANDLER_TYPE.HTML; + case "m" -> BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN; case "b" -> BasicContentHandlerFactory.HANDLER_TYPE.BODY; case "i" -> BasicContentHandlerFactory.HANDLER_TYPE.IGNORE; case "t" -> BasicContentHandlerFactory.HANDLER_TYPE.TEXT; - default -> throw new TikaConfigException("Can't understand " + t + " as a handler type. Must be one of: x(ml), h(tml), b(ody), i(gnore), t(ext)"); + default -> throw new TikaConfigException("Can't understand " + t + " as a handler type. Must be one of: x(ml), h(tml), m(arkdown), b(ody), i(gnore), t(ext)"); }; } diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java index 3513de3b55..e6127d5005 100644 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java @@ -53,6 +53,20 @@ public enum ParseMode { */ NO_PARSE, + /** + * Concatenates content and emits only the raw content string, with no + * metadata and no JSON wrapper. + * <p> + * This mode parses like CONCATENATE (producing a single metadata object with + * merged content from all embedded documents), but at emit time, emitters + * write only the value of {@code X-TIKA:content} as a raw string instead of + * serializing the full metadata list as JSON. + * <p> + * This is useful when you want plain text, markdown, or HTML output files + * without any metadata overhead. + */ + CONTENT_ONLY, + /** * Extracts embedded document bytes and emits them, with full RMETA metadata. * <p> @@ -88,7 +102,7 @@ public enum ParseMode { } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Invalid parse mode: '" + modeString + "'. " + - "Must be one of: RMETA, CONCATENATE, NO_PARSE, UNPACK"); + "Must be one of: RMETA, CONCATENATE, CONTENT_ONLY, NO_PARSE, UNPACK"); } } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java index cea86da760..e60f1f5c84 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java @@ -19,8 +19,12 @@ package org.apache.tika.pipes.core.server; import static org.apache.tika.pipes.core.server.PipesWorker.metadataIsEmpty; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.List; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,6 +32,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.filter.IncludeFieldMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.parser.ParseContext; @@ -109,7 +114,10 @@ class EmitHandler { return new PipesResult(PipesResult.RESULT_STATUS.EMITTER_INITIALIZATION_EXCEPTION, ExceptionUtils.getStackTrace(e)); } try { - if (isExtractEmbeddedBytes && + ParseMode parseMode = parseContext.get(ParseMode.class); + if (parseMode == ParseMode.CONTENT_ONLY && emitter instanceof StreamEmitter) { + emitContentOnly((StreamEmitter) emitter, emitKey, parseData, parseContext); + } else if (isExtractEmbeddedBytes && parseData.toBePackagedForStreamEmitter()) { emitContentsAndBytes(emitter, emitKey, parseData); } else { @@ -142,6 +150,23 @@ class EmitHandler { } } + private void emitContentOnly(StreamEmitter emitter, EmitKey emitKey, + MetadataListAndEmbeddedBytes parseData, + ParseContext parseContext) throws IOException { + List<Metadata> metadataList = parseData.getMetadataList(); + String content = ""; + if (metadataList != null && !metadataList.isEmpty()) { + String val = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); + if (val != null) { + content = val; + } + } + byte[] bytes = content.getBytes(StandardCharsets.UTF_8); + try (InputStream is = new ByteArrayInputStream(bytes)) { + emitter.emit(emitKey.getEmitKey(), is, new Metadata(), parseContext); + } + } + private void emitContentsAndBytes(Emitter emitter, EmitKey emitKey, MetadataListAndEmbeddedBytes parseData) { if (!(emitter instanceof StreamEmitter)) { @@ -210,7 +235,14 @@ class EmitHandler { private void filterMetadata(MetadataListAndEmbeddedBytes parseData, ParseContext parseContext) { MetadataFilter filter = parseContext.get(MetadataFilter.class); if (filter == null) { - filter = defaultMetadataFilter; + ParseMode parseMode = parseContext.get(ParseMode.class); + if (parseMode == ParseMode.CONTENT_ONLY) { + filter = new IncludeFieldMetadataFilter( + Set.of(TikaCoreProperties.TIKA_CONTENT.getName(), + TikaCoreProperties.CONTAINER_EXCEPTION.getName())); + } else { + filter = defaultMetadataFilter; + } } if (filter instanceof NoOpFilter) { return; diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index a0f89ef3f2..8385631ae4 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -93,6 +93,11 @@ class ParseHandler { // The difference is in setup (PipesWorker) - UNPACK has mandatory byte extraction metadataList = parseRecursive(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); + } else if (parseMode == ParseMode.CONCATENATE || parseMode == ParseMode.CONTENT_ONLY) { + // CONTENT_ONLY parses identically to CONCATENATE; the difference is + // at emit time where emitters write only the raw content string + metadataList = parseConcatenated(fetchEmitTuple, contentHandlerFactory, stream, metadata, + parseContext); } else { metadataList = parseConcatenated(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java index ee18f7a369..5cef444687 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java @@ -17,6 +17,9 @@ package org.apache.tika.pipes.core; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.StandardCharsets; @@ -35,6 +38,7 @@ import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; @@ -720,4 +724,86 @@ public class PipesClientTest { assertEquals("Heartbeat Test", metadata.get("dc:creator")); } } + + @Test + public void testContentOnlyMode(@TempDir Path tmp) throws Exception { + // Test that CONTENT_ONLY mode strips all metadata except X-TIKA:content + PipesClient pipesClient = init(tmp, testDoc); + + ParseContext parseContext = new ParseContext(); + parseContext.set(ParseMode.class, ParseMode.CONTENT_ONLY); + + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc), + new EmitKey(), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + assertNotNull(pipesResult.emitData().getMetadataList()); + assertEquals(1, pipesResult.emitData().getMetadataList().size()); + Metadata metadata = pipesResult.emitData().getMetadataList().get(0); + + // Content should be present + String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); + assertNotNull(content, "TIKA_CONTENT should be present in CONTENT_ONLY mode"); + assertFalse(content.isEmpty(), "TIKA_CONTENT should not be empty"); + + // Other metadata should be stripped by the IncludeFieldMetadataFilter + assertNull(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), + "RESOURCE_NAME should be stripped in CONTENT_ONLY mode"); + assertNull(metadata.get(Metadata.CONTENT_TYPE), + "CONTENT_TYPE should be stripped in CONTENT_ONLY mode"); + } + + @Test + public void testContentOnlyModeWithUserFilter(@TempDir Path tmp) throws Exception { + // Test that CONTENT_ONLY mode respects a user-provided MetadataFilter + ParseContext parseContext = new ParseContext(); + parseContext.set(ParseMode.class, ParseMode.CONTENT_ONLY); + // Set a user metadata filter via JSON - this should override the default CONTENT_ONLY filter + parseContext.setJsonConfig("metadata-filters", """ + ["mock-upper-case-filter"] + """); + + PipesClient pipesClient = init(tmp, testDoc); + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc), + new EmitKey(), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + assertNotNull(pipesResult.emitData().getMetadataList()); + assertEquals(1, pipesResult.emitData().getMetadataList().size()); + Metadata metadata = pipesResult.emitData().getMetadataList().get(0); + + // User filter (uppercase) should take effect instead of CONTENT_ONLY filter + // So all metadata should still be present (but uppercased) + assertEquals("TESTOVERLAPPINGTEXT.PDF", + metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), + "User filter should take priority over CONTENT_ONLY filter"); + } + + @Test + public void testConcatenateMode(@TempDir Path tmp) throws Exception { + // Test that CONCATENATE mode returns a single metadata object with content + // but preserves all metadata fields (unlike CONTENT_ONLY) + String testFile = "mock-embedded.xml"; + PipesClient pipesClient = init(tmp, testFile); + + ParseContext parseContext = new ParseContext(); + parseContext.set(ParseMode.class, ParseMode.CONCATENATE); + + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple(testFile, new FetchKey(fetcherName, testFile), + new EmitKey(), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + assertNotNull(pipesResult.emitData().getMetadataList()); + // CONCATENATE produces a single metadata object (not one per embedded doc) + assertEquals(1, pipesResult.emitData().getMetadataList().size()); + Metadata metadata = pipesResult.emitData().getMetadataList().get(0); + + // Content should be present + String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); + assertNotNull(content, "TIKA_CONTENT should be present in CONCATENATE mode"); + + // All metadata should still be present (unlike CONTENT_ONLY) + assertNotNull(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), + "RESOURCE_NAME should be preserved in CONCATENATE mode"); + } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java index a3cfee1316..e990652b30 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java @@ -117,6 +117,7 @@ public class FileSystemEmitter extends AbstractStreamEmitter { return; } } + if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.EXCEPTION) { try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW)) { //CREATE_NEW forces an IOException if the file already exists @@ -126,7 +127,6 @@ public class FileSystemEmitter extends AbstractStreamEmitter { try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) { JsonMetadataList.toJson(metadataList, writer, config.prettyPrint()); } - } } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 354331ce38..f396c1aa23 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -764,10 +764,11 @@ public class TikaResource { LOG.debug("produceRawOutput: handlerType={}, contentHandlerFactory={}", handlerTypeName, context.get(ContentHandlerFactory.class)); - // Parse with pipes + // Parse with pipes using CONTENT_ONLY mode - the metadata filter in + // EmitHandler will strip everything except X-TIKA:content List<Metadata> metadataList; try { - metadataList = parseWithPipes(tis, metadata, context, ParseMode.CONCATENATE); + metadataList = parseWithPipes(tis, metadata, context, ParseMode.CONTENT_ONLY); } finally { tis.close(); } @@ -776,6 +777,8 @@ public class TikaResource { // For raw streaming endpoints, throw exception if there was a parse error // (JSON endpoints return exceptions in metadata) + // Note: CONTAINER_EXCEPTION is extracted before the metadata filter runs, + // so it's available in the passback even though the filter strips it if (!metadataList.isEmpty()) { String exception = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION); if (exception != null && !exception.isEmpty()) {
