This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2ac4feffb8 TIKA-4656 - add content-only parse mode, markdown handler
integration, and docs (#2600)
2ac4feffb8 is described below
commit 2ac4feffb8bee7ba7b619b7d19a1f30d33bd8150
Author: Tim Allison <[email protected]>
AuthorDate: Mon Feb 9 21:22:04 2026 -0500
TIKA-4656 - add content-only parse mode, markdown handler integration, and
docs (#2600)
Co-authored-by: Claude Opus 4.6 <[email protected]>
---
docs/modules/ROOT/nav.adoc | 1 +
docs/modules/ROOT/pages/pipes/index.adoc | 1 +
docs/modules/ROOT/pages/pipes/parse-modes.adoc | 139 +++++++++++++++++++++
docs/modules/ROOT/pages/using-tika/cli/index.adoc | 75 +++++++++++
.../ROOT/pages/using-tika/java-api/index.adoc | 14 ++-
.../ROOT/pages/using-tika/server/index.adoc | 54 ++++++++
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 10 ++
.../org/apache/tika/async/cli/PluginsWriter.java | 31 +++++
.../apache/tika/async/cli/SimpleAsyncConfig.java | 19 ++-
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 14 ++-
.../java/org/apache/tika/pipes/api/ParseMode.java | 16 ++-
.../apache/tika/pipes/core/server/EmitHandler.java | 36 +++++-
.../tika/pipes/core/server/ParseHandler.java | 5 +
.../apache/tika/pipes/core/PipesClientTest.java | 86 +++++++++++++
.../tika/pipes/emitter/fs/FileSystemEmitter.java | 2 +-
.../tika/server/core/resource/TikaResource.java | 7 +-
16 files changed, 499 insertions(+), 11 deletions(-)
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 89ea2c0c8c..eaeddf383f 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -21,6 +21,7 @@
** xref:using-tika/cli/index.adoc[Command Line]
** xref:using-tika/grpc/index.adoc[gRPC]
* xref:pipes/index.adoc[Pipes]
+** xref:pipes/parse-modes.adoc[Parse Modes]
** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes]
* xref:configuration/index.adoc[Configuration]
** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
diff --git a/docs/modules/ROOT/pages/pipes/index.adoc
b/docs/modules/ROOT/pages/pipes/index.adoc
index 899a427514..ff67ab6e0c 100644
--- a/docs/modules/ROOT/pages/pipes/index.adoc
+++ b/docs/modules/ROOT/pages/pipes/index.adoc
@@ -29,6 +29,7 @@ Tika Pipes provides a framework for processing large volumes
of documents with:
== Topics
+* xref:pipes/parse-modes.adoc[Parse Modes] - Control how documents are parsed
and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `NO_PARSE`, `UNPACK`)
* xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] - Extract raw bytes
from embedded documents using `ParseMode.UNPACK`
// Add links to specific topics as they are created
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
new file mode 100644
index 0000000000..a023d0b406
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -0,0 +1,139 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Parse Modes
+
+Tika Pipes uses `ParseMode` to control how documents are parsed and how
results are emitted.
+The parse mode is set on the `ParseContext` or configured in `PipesConfig`.
+
+== Available Parse Modes
+
+[cols="1,3"]
+|===
+|Mode |Description
+
+|`RMETA`
+|Default mode. Each embedded document produces a separate `Metadata` object.
+Results are returned as a JSON array of metadata objects.
+
+|`CONCATENATE`
+|All content from embedded documents is concatenated into a single content
field.
+Results are returned as a single `Metadata` object with all metadata preserved.
+
+|`CONTENT_ONLY`
+|Parses like `CONCATENATE` but emits only the raw extracted content — no JSON
wrapper,
+no metadata fields. Useful when you want just the text, markdown, or HTML
output.
+
+|`NO_PARSE`
+|Skip parsing entirely. Useful for pipelines that only need to fetch and emit
raw bytes.
+
+|`UNPACK`
+|Extract raw bytes from embedded documents. See
xref:pipes/unpack-config.adoc[Extracting Embedded Bytes].
+|===
+
+== CONCATENATE Mode
+
+`CONCATENATE` merges all content from embedded documents into a single content
field
+while preserving all metadata from parsing:
+
+[source,json]
+----
+{
+ "parseContext": {
+ "parseMode": "CONCATENATE"
+ }
+}
+----
+
+The result is a single `Metadata` object containing the concatenated content in
+`X-TIKA:content` along with all other metadata fields (title, author, content
type, etc.).
+
+== CONTENT_ONLY Mode
+
+`CONTENT_ONLY` is designed for use cases where you want just the extracted
content
+written to storage — no JSON wrapping, no metadata overhead. This is
particularly
+useful for:
+
+* Extracting markdown files from a document corpus
+* Building plain text search indexes
+* Generating HTML versions of documents
+
+[source,json]
+----
+{
+ "parseContext": {
+ "parseMode": "CONTENT_ONLY"
+ }
+}
+----
+
+=== How It Works
+
+1. Documents are parsed identically to `CONCATENATE` mode — all embedded
content is
+ merged into a single content field.
+2. A metadata filter automatically strips all metadata except `X-TIKA:content`
and
+ `X-TIKA:CONTAINER_EXCEPTION` (for error tracking).
+3. When the emitter is a `StreamEmitter` (such as the filesystem or S3
emitter), the
+ raw content string is written directly as bytes — no JSON serialization.
+
+=== Metadata Filtering
+
+By default, `CONTENT_ONLY` mode applies an `IncludeFieldMetadataFilter` that
retains
+only `X-TIKA:content` and `X-TIKA:CONTAINER_EXCEPTION`. If you set your own
+`MetadataFilter` on the `ParseContext`, your filter takes priority.
+
+=== CLI Usage
+
+The `tika-async-cli` batch processor supports `CONTENT_ONLY` via the
`--content-only`
+flag:
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /input -o /output -h m --content-only
+----
+
+This produces `.md` files (when using the `m` handler type) containing only the
+extracted markdown content.
+
+=== Content Handler Types
+
+The content format depends on the configured handler type:
+
+[cols="1,1,2"]
+|===
+|Handler |Extension |Description
+
+|`t` (text)
+|`.txt`
+|Plain text output
+
+|`h` (html)
+|`.html`
+|HTML output
+
+|`x` (xml)
+|`.xml`
+|XHTML output
+
+|`m` (markdown)
+|`.md`
+|Markdown output
+
+|`b` (body)
+|`.txt`
+|Body content only, as plain text (via `BodyContentHandler`)
+|===
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index 0e19ed6170..17a631e1f8 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -83,6 +83,9 @@ java -jar tika-app.jar [option...] [file|port...]
|`-t` or `--text`
|Output plain text
+|`--md`
+|Output Markdown
+
|`-m` or `--metadata`
|Output metadata only
@@ -124,6 +127,13 @@ Process entire directories by specifying input and output
paths:
java -jar tika-app.jar -i /path/to/input -o /path/to/output
----
+=== Extract Markdown from a file
+
+[source,bash]
+----
+java -jar tika-app.jar --md document.docx
+----
+
=== Custom configuration
Use a custom configuration file:
@@ -132,3 +142,68 @@ Use a custom configuration file:
----
java -jar tika-app.jar --config=tika-config.json document.pdf
----
+
+== Batch Processing (tika-async-cli)
+
+For processing large numbers of files, use `tika-async-cli`. It uses the Tika
Pipes
+architecture with forked JVM processes for fault tolerance.
+
+=== Basic Batch Usage
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output
+----
+
+This processes all files in the input directory and writes JSON metadata
(RMETA format)
+to the output directory.
+
+=== Batch Options
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-i`
+|Input directory
+
+|`-o`
+|Output directory
+
+|`-h` or `--handlerType`
+|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body,
`i`=ignore (default: `t`)
+
+|`--concatenate`
+|Concatenate content from all embedded documents into a single content field
+
+|`--content-only`
+|Output only extracted content (no metadata, no JSON wrapper); implies
`--concatenate`
+
+|`-T` or `--timeoutMs`
+|Timeout for each parse in milliseconds
+
+|`-n` or `--numClients`
+|Number of parallel forked processes
+
+|`-p` or `--pluginsDir`
+|Plugins directory
+|===
+
+=== Batch Examples
+
+Extract markdown content only (no metadata) from all files:
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output -h m
--content-only
+----
+
+This produces `.md` files in the output directory containing just the
extracted markdown
+content — no JSON wrappers, no metadata fields.
+
+Extract text with all metadata in `CONCATENATE` mode:
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output --concatenate
+----
diff --git a/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
b/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
index 5b2ff99930..4853446d50 100644
--- a/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/java-api/index.adoc
@@ -100,12 +100,24 @@ For example, use `TikaInputStream.get(path)` for a
`Path`, or `TikaInputStream.g
for a `byte[]`. This allows Tika to access the underlying resource efficiently
and enables
features like mark/reset support that many parsers and detectors require.
-=== Utility Classes
+=== Content Handlers
+
+Tika provides several content handlers that control the output format:
**BodyContentHandler**:: Extracts and converts the body content to streams or
strings.
+**ToTextContentHandler**:: Outputs plain text.
+
+**ToHTMLContentHandler**:: Outputs HTML.
+
+**ToXMLContentHandler**:: Outputs XHTML/XML.
+
+**ToMarkdownContentHandler**:: Outputs Markdown, preserving structural
semantics like headings, lists, tables, code blocks, emphasis, and links.
+
**ParsingReader**:: Uses background threading to return extracted text as
character streams.
+Use `BasicContentHandlerFactory` to create handlers by type: `TEXT`, `HTML`,
`XML`, `BODY`, `MARKDOWN`, `IGNORE`.
+
=== Key Metadata Properties
* `TikaCoreProperties.RESOURCE_NAME_KEY` - filename or resource identifier
diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc
b/docs/modules/ROOT/pages/using-tika/server/index.adoc
index dbb086b5e6..1eee6a448f 100644
--- a/docs/modules/ROOT/pages/using-tika/server/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc
@@ -33,6 +33,60 @@ java -jar tika-server-standard.jar
The server starts on port 9998 by default.
+== Endpoints
+
+=== Content Extraction (`/tika`)
+
+The `/tika` endpoint extracts content from a document as plain text.
+
+[source,bash]
+----
+curl -T document.pdf http://localhost:9998/tika
+----
+
+==== Markdown Output (`/tika/md`)
+
+The `/tika/md` endpoint extracts content as Markdown, preserving structural
semantics
+like headings, lists, tables, and emphasis:
+
+[source,bash]
+----
+curl -T document.docx http://localhost:9998/tika/md
+----
+
+==== Custom Handler Type
+
+Use the `X-Tika-Handler` header to control the output format. Valid values:
`text` (default),
+`html`, `xml`, `markdown`, `ignore`.
+
+[source,bash]
+----
+curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika
+----
+
+=== Recursive Metadata (`/rmeta`)
+
+The `/rmeta` endpoint returns metadata for the container document and all
embedded documents
+as a JSON array of metadata objects.
+
+[source,bash]
+----
+curl -T document.pdf http://localhost:9998/rmeta
+----
+
+Content handler can be specified in the URL path:
+
+* `/rmeta/text` - plain text content (default)
+* `/rmeta/html` - HTML content
+* `/rmeta/xml` - XHTML content
+* `/rmeta/markdown` - Markdown content
+* `/rmeta/ignore` - metadata only, no content
+
+[source,bash]
+----
+curl -T document.docx http://localhost:9998/rmeta/markdown
+----
+
== Topics
* xref:using-tika/server/tls.adoc[TLS/SSL Configuration] - Secure your server
with TLS and mutual authentication
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index dfcc299520..4fbedb8db9 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -100,6 +100,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToMarkdownContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.serialization.JsonMetadata;
@@ -225,6 +226,12 @@ public class TikaCLI {
* Fork mode plugins directory.
*/
private String forkPluginsDir = null;
+ private final OutputType MARKDOWN = new OutputType() {
+ @Override
+ protected ContentHandler getContentHandler(OutputStream output,
Metadata metadata) throws Exception {
+ return new BodyContentHandler(new
ToMarkdownContentHandler(getOutputWriter(output, encoding)));
+ }
+ };
private final OutputType XML = new OutputType() {
@Override
protected ContentHandler getContentHandler(OutputStream output,
Metadata metadata) throws Exception {
@@ -483,6 +490,8 @@ public class TikaCLI {
type = XML;
} else if (arg.equals("-h") || arg.equals("--html")) {
type = HTML;
+ } else if (arg.equals("--md")) {
+ type = MARKDOWN;
} else if (arg.equals("-t") || arg.equals("--text")) {
type = TEXT;
} else if (arg.equals("-T") || arg.equals("--text-main")) {
@@ -744,6 +753,7 @@ public class TikaCLI {
out.println(" -x or --xml Output XHTML content
(default)");
out.println(" -h or --html Output HTML content");
out.println(" -t or --text Output plain text content
(body)");
+ out.println(" --md Output Markdown content
(body)");
out.println(" -T or --text-main Output plain text content
(main content only via boilerpipe handler)");
out.println(" -A or --text-all Output all text content");
out.println(" -m or --metadata Output only metadata");
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
index 1257c48e4c..dd2b02d067 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
@@ -27,7 +27,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.core.PipesConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.utils.StringUtils;
public class PluginsWriter {
@@ -101,6 +103,24 @@ public class PluginsWriter {
if (simpleAsyncConfig.getTimeoutMs() != null) {
pipesConfig.setTimeoutMillis(simpleAsyncConfig.getTimeoutMs());
}
+ if (simpleAsyncConfig.isContentOnly()) {
+ pipesConfig.setParseMode(ParseMode.CONTENT_ONLY);
+ } else if (simpleAsyncConfig.isConcatenate()) {
+ pipesConfig.setParseMode(ParseMode.CONCATENATE);
+ }
+
+ // For content-only mode, change the emitter file extension based
on handler type
+ if (simpleAsyncConfig.isContentOnly()) {
+ String ext =
getFileExtensionForHandlerType(simpleAsyncConfig.getHandlerType());
+ if (emitters != null && emitters.has("fse")) {
+ ObjectNode fse = (ObjectNode) emitters.get("fse");
+ if (fse != null && fse.has("file-system-emitter")) {
+ ObjectNode fsEmitter = (ObjectNode)
fse.get("file-system-emitter");
+ fsEmitter.put("fileExtension", ext);
+ }
+ }
+ }
+
root.set("pipes", objectMapper.valueToTree(pipesConfig));
objectMapper.writerWithDefaultPrettyPrinter().writeValue(output.toFile(), root);
@@ -108,4 +128,15 @@ public class PluginsWriter {
throw new IOException(e);
}
}
+
+ private static String getFileExtensionForHandlerType(
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType) {
+ return switch (handlerType) {
+ case MARKDOWN -> "md";
+ case HTML -> "html";
+ case XML -> "xml";
+ case BODY, TEXT -> "txt";
+ default -> "txt";
+ };
+ }
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
index 0abefc9267..5ea5e764ba 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -37,6 +37,10 @@ class SimpleAsyncConfig {
private final BasicContentHandlerFactory.HANDLER_TYPE handlerType;
private final String pluginsDir;
+ // Parse mode options
+ private final boolean concatenate;
+ private final boolean contentOnly;
+
// Frictionless Data Package options
private final String unpackFormat; // "REGULAR" or "FRICTIONLESS"
private final String unpackMode; // "ZIPPED" or "DIRECTORY"
@@ -47,12 +51,13 @@ class SimpleAsyncConfig {
String tikaConfig,
BasicContentHandlerFactory.HANDLER_TYPE handlerType,
ExtractBytesMode extractBytesMode, String
pluginsDir) {
this(inputDir, outputDir, numClients, timeoutMs, xmx, fileList,
tikaConfig, handlerType,
- extractBytesMode, pluginsDir, null, null, false);
+ extractBytesMode, pluginsDir, false, false, null, null, false);
}
public SimpleAsyncConfig(String inputDir, String outputDir, Integer
numClients, Long timeoutMs, String xmx, String fileList,
String tikaConfig,
BasicContentHandlerFactory.HANDLER_TYPE handlerType,
ExtractBytesMode extractBytesMode, String
pluginsDir,
+ boolean concatenate, boolean contentOnly,
String unpackFormat, String unpackMode, boolean
unpackIncludeMetadata) {
this.inputDir = inputDir;
this.outputDir = outputDir;
@@ -64,6 +69,8 @@ class SimpleAsyncConfig {
this.handlerType = handlerType;
this.extractBytesMode = extractBytesMode;
this.pluginsDir = pluginsDir;
+ this.concatenate = concatenate;
+ this.contentOnly = contentOnly;
this.unpackFormat = unpackFormat;
this.unpackMode = unpackMode;
this.unpackIncludeMetadata = unpackIncludeMetadata;
@@ -109,6 +116,14 @@ class SimpleAsyncConfig {
return pluginsDir;
}
+ public boolean isConcatenate() {
+ return concatenate;
+ }
+
+ public boolean isContentOnly() {
+ return contentOnly;
+ }
+
public String getUnpackFormat() {
return unpackFormat;
}
@@ -134,6 +149,8 @@ class SimpleAsyncConfig {
", extractBytesMode=" + extractBytesMode +
", handlerType=" + handlerType +
", pluginsDir='" + pluginsDir + '\'' +
+ ", concatenate=" + concatenate +
+ ", contentOnly=" + contentOnly +
", unpackFormat='" + unpackFormat + '\'' +
", unpackMode='" + unpackMode + '\'' +
", unpackIncludeMetadata=" + unpackIncludeMetadata +
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 4687845389..a28e0b26f9 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -67,12 +67,14 @@ public class TikaAsyncCLI {
options.addOption("X", "Xmx", true, "heap for the forked clients in
usual jvm heap amount, e.g. -X 1g");
options.addOption("?", "help", false, "this help message");
options.addOption("T", "timeoutMs", true, "timeout for each parse in
milliseconds");
- options.addOption("h", "handlerType", true, "handler type: t=text,
h=html, x=xml, b=body, i=ignore");
+ options.addOption("h", "handlerType", true, "handler type: t=text,
h=html, x=xml, m=markdown, b=body, i=ignore");
options.addOption("p", "pluginsDir", true, "plugins directory");
//options.addOption("l", "fileList", true, "file list");
options.addOption("c", "config", true, "tikaConfig.json");
options.addOption("z", "unzipShallow", false, "extract raw bytes from
direct attachments only (depth=1)");
options.addOption("Z", "unzipRecursive", false, "extract raw bytes
from all attachments recursively");
+ options.addOption(null, "concatenate", false, "concatenate content
from all embedded documents into a single content field");
+ options.addOption(null, "content-only", false, "output only extracted
content (no metadata, no JSON wrapper); implies --concatenate");
options.addOption(null, "unpack-format", true,
"output format for unpacking: REGULAR (default) or
FRICTIONLESS");
options.addOption(null, "unpack-mode", true,
@@ -224,6 +226,10 @@ public class TikaAsyncCLI {
pluginsDir = line.getOptionValue('p');
}
+ // Parse mode options
+ boolean contentOnly = line.hasOption("content-only");
+ boolean concatenate = line.hasOption("concatenate") || contentOnly;
+
// Frictionless Data Package options
String unpackFormat = null;
String unpackMode = null;
@@ -282,17 +288,19 @@ public class TikaAsyncCLI {
return new SimpleAsyncConfig(inputDir, outputDir,
numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType,
- extractBytesMode, pluginsDir, unpackFormat, unpackMode,
unpackIncludeMetadata);
+ extractBytesMode, pluginsDir, concatenate, contentOnly,
+ unpackFormat, unpackMode, unpackIncludeMetadata);
}
private static BasicContentHandlerFactory.HANDLER_TYPE
getHandlerType(String t) throws TikaConfigException {
return switch (t) {
case "x" -> BasicContentHandlerFactory.HANDLER_TYPE.XML;
case "h" -> BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+ case "m" -> BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
case "b" -> BasicContentHandlerFactory.HANDLER_TYPE.BODY;
case "i" -> BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
case "t" -> BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
- default -> throw new TikaConfigException("Can't understand " + t +
" as a handler type. Must be one of: x(ml), h(tml), b(ody), i(gnore), t(ext)");
+ default -> throw new TikaConfigException("Can't understand " + t +
" as a handler type. Must be one of: x(ml), h(tml), m(arkdown), b(ody),
i(gnore), t(ext)");
};
}
diff --git
a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
index 3513de3b55..e6127d5005 100644
---
a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
+++
b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
@@ -53,6 +53,20 @@ public enum ParseMode {
*/
NO_PARSE,
+ /**
+ * Concatenates content and emits only the raw content string, with no
+ * metadata and no JSON wrapper.
+ * <p>
+ * This mode parses like CONCATENATE (producing a single metadata object
with
+ * merged content from all embedded documents), but at emit time, emitters
+ * write only the value of {@code X-TIKA:content} as a raw string instead
of
+ * serializing the full metadata list as JSON.
+ * <p>
+ * This is useful when you want plain text, markdown, or HTML output files
+ * without any metadata overhead.
+ */
+ CONTENT_ONLY,
+
/**
* Extracts embedded document bytes and emits them, with full RMETA
metadata.
* <p>
@@ -88,7 +102,7 @@ public enum ParseMode {
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
"Invalid parse mode: '" + modeString + "'. " +
- "Must be one of: RMETA, CONCATENATE, NO_PARSE,
UNPACK");
+ "Must be one of: RMETA, CONCATENATE, CONTENT_ONLY,
NO_PARSE, UNPACK");
}
}
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
index cea86da760..e60f1f5c84 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
@@ -19,8 +19,12 @@ package org.apache.tika.pipes.core.server;
import static org.apache.tika.pipes.core.server.PipesWorker.metadataIsEmpty;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.util.List;
+import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -28,6 +32,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.IncludeFieldMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.parser.ParseContext;
@@ -109,7 +114,10 @@ class EmitHandler {
return new
PipesResult(PipesResult.RESULT_STATUS.EMITTER_INITIALIZATION_EXCEPTION,
ExceptionUtils.getStackTrace(e));
}
try {
- if (isExtractEmbeddedBytes &&
+ ParseMode parseMode = parseContext.get(ParseMode.class);
+ if (parseMode == ParseMode.CONTENT_ONLY && emitter instanceof
StreamEmitter) {
+ emitContentOnly((StreamEmitter) emitter, emitKey, parseData,
parseContext);
+ } else if (isExtractEmbeddedBytes &&
parseData.toBePackagedForStreamEmitter()) {
emitContentsAndBytes(emitter, emitKey, parseData);
} else {
@@ -142,6 +150,23 @@ class EmitHandler {
}
}
+ private void emitContentOnly(StreamEmitter emitter, EmitKey emitKey,
+ MetadataListAndEmbeddedBytes parseData,
+ ParseContext parseContext) throws
IOException {
+ List<Metadata> metadataList = parseData.getMetadataList();
+ String content = "";
+ if (metadataList != null && !metadataList.isEmpty()) {
+ String val =
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
+ if (val != null) {
+ content = val;
+ }
+ }
+ byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
+ try (InputStream is = new ByteArrayInputStream(bytes)) {
+ emitter.emit(emitKey.getEmitKey(), is, new Metadata(),
parseContext);
+ }
+ }
+
private void emitContentsAndBytes(Emitter emitter, EmitKey emitKey,
MetadataListAndEmbeddedBytes parseData) {
if (!(emitter instanceof StreamEmitter)) {
@@ -210,7 +235,14 @@ class EmitHandler {
private void filterMetadata(MetadataListAndEmbeddedBytes parseData,
ParseContext parseContext) {
MetadataFilter filter = parseContext.get(MetadataFilter.class);
if (filter == null) {
- filter = defaultMetadataFilter;
+ ParseMode parseMode = parseContext.get(ParseMode.class);
+ if (parseMode == ParseMode.CONTENT_ONLY) {
+ filter = new IncludeFieldMetadataFilter(
+ Set.of(TikaCoreProperties.TIKA_CONTENT.getName(),
+
TikaCoreProperties.CONTAINER_EXCEPTION.getName()));
+ } else {
+ filter = defaultMetadataFilter;
+ }
}
if (filter instanceof NoOpFilter) {
return;
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index a0f89ef3f2..8385631ae4 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -93,6 +93,11 @@ class ParseHandler {
// The difference is in setup (PipesWorker) - UNPACK has mandatory
byte extraction
metadataList =
parseRecursive(fetchEmitTuple, contentHandlerFactory,
stream, metadata, parseContext);
+ } else if (parseMode == ParseMode.CONCATENATE || parseMode ==
ParseMode.CONTENT_ONLY) {
+ // CONTENT_ONLY parses identically to CONCATENATE; the difference
is
+ // at emit time where emitters write only the raw content string
+ metadataList = parseConcatenated(fetchEmitTuple,
contentHandlerFactory, stream, metadata,
+ parseContext);
} else {
metadataList = parseConcatenated(fetchEmitTuple,
contentHandlerFactory, stream, metadata,
parseContext);
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index ee18f7a369..5cef444687 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -17,6 +17,9 @@
package org.apache.tika.pipes.core;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.charset.StandardCharsets;
@@ -35,6 +38,7 @@ import
org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
@@ -720,4 +724,86 @@ public class PipesClientTest {
assertEquals("Heartbeat Test", metadata.get("dc:creator"));
}
}
+
+ @Test
+ public void testContentOnlyMode(@TempDir Path tmp) throws Exception {
+ // Test that CONTENT_ONLY mode strips all metadata except
X-TIKA:content
+ PipesClient pipesClient = init(tmp, testDoc);
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(ParseMode.class, ParseMode.CONTENT_ONLY);
+
+ PipesResult pipesResult = pipesClient.process(
+ new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc),
+ new EmitKey(), new Metadata(), parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
+ assertNotNull(pipesResult.emitData().getMetadataList());
+ assertEquals(1, pipesResult.emitData().getMetadataList().size());
+ Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
+
+ // Content should be present
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ assertNotNull(content, "TIKA_CONTENT should be present in CONTENT_ONLY
mode");
+ assertFalse(content.isEmpty(), "TIKA_CONTENT should not be empty");
+
+ // Other metadata should be stripped by the IncludeFieldMetadataFilter
+ assertNull(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY),
+ "RESOURCE_NAME should be stripped in CONTENT_ONLY mode");
+ assertNull(metadata.get(Metadata.CONTENT_TYPE),
+ "CONTENT_TYPE should be stripped in CONTENT_ONLY mode");
+ }
+
+ @Test
+ public void testContentOnlyModeWithUserFilter(@TempDir Path tmp) throws
Exception {
+ // Test that CONTENT_ONLY mode respects a user-provided MetadataFilter
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(ParseMode.class, ParseMode.CONTENT_ONLY);
+ // Set a user metadata filter via JSON - this should override the
default CONTENT_ONLY filter
+ parseContext.setJsonConfig("metadata-filters", """
+ ["mock-upper-case-filter"]
+ """);
+
+ PipesClient pipesClient = init(tmp, testDoc);
+ PipesResult pipesResult = pipesClient.process(
+ new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc),
+ new EmitKey(), new Metadata(), parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
+ assertNotNull(pipesResult.emitData().getMetadataList());
+ assertEquals(1, pipesResult.emitData().getMetadataList().size());
+ Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
+
+ // User filter (uppercase) should take effect instead of CONTENT_ONLY
filter
+ // So all metadata should still be present (but uppercased)
+ assertEquals("TESTOVERLAPPINGTEXT.PDF",
+ metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY),
+ "User filter should take priority over CONTENT_ONLY filter");
+ }
+
+ @Test
+ public void testConcatenateMode(@TempDir Path tmp) throws Exception {
+ // Test that CONCATENATE mode returns a single metadata object with
content
+ // but preserves all metadata fields (unlike CONTENT_ONLY)
+ String testFile = "mock-embedded.xml";
+ PipesClient pipesClient = init(tmp, testFile);
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(ParseMode.class, ParseMode.CONCATENATE);
+
+ PipesResult pipesResult = pipesClient.process(
+ new FetchEmitTuple(testFile, new FetchKey(fetcherName,
testFile),
+ new EmitKey(), new Metadata(), parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
+ assertNotNull(pipesResult.emitData().getMetadataList());
+ // CONCATENATE produces a single metadata object (not one per embedded
doc)
+ assertEquals(1, pipesResult.emitData().getMetadataList().size());
+ Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
+
+ // Content should be present
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ assertNotNull(content, "TIKA_CONTENT should be present in CONCATENATE
mode");
+
+ // All metadata should still be present (unlike CONTENT_ONLY)
+ assertNotNull(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY),
+ "RESOURCE_NAME should be preserved in CONCATENATE mode");
+ }
}
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index a3cfee1316..e990652b30 100644
---
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -117,6 +117,7 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
return;
}
}
+
if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.EXCEPTION) {
             try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8,
                     StandardOpenOption.CREATE_NEW)) { //CREATE_NEW forces an IOException if the file already exists
@@ -126,7 +127,6 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
             try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
                 JsonMetadataList.toJson(metadataList, writer, config.prettyPrint());
}
-
}
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 354331ce38..f396c1aa23 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -764,10 +764,11 @@ public class TikaResource {
LOG.debug("produceRawOutput: handlerType={}, contentHandlerFactory={}",
handlerTypeName, context.get(ContentHandlerFactory.class));
- // Parse with pipes
+ // Parse with pipes using CONTENT_ONLY mode - the metadata filter in
+ // EmitHandler will strip everything except X-TIKA:content
List<Metadata> metadataList;
try {
-            metadataList = parseWithPipes(tis, metadata, context, ParseMode.CONCATENATE);
+            metadataList = parseWithPipes(tis, metadata, context, ParseMode.CONTENT_ONLY);
} finally {
tis.close();
}
@@ -776,6 +777,8 @@ public class TikaResource {
         // For raw streaming endpoints, throw exception if there was a parse error
// (JSON endpoints return exceptions in metadata)
+        // Note: CONTAINER_EXCEPTION is extracted before the metadata filter runs,
+        // so it's available in the passback even though the filter strips it
if (!metadataList.isEmpty()) {
             String exception = metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION);
if (exception != null && !exception.isEmpty()) {