This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 72b6173b88 TIKA-4663 -- add cli option for markdown in 3.x (#2619)
72b6173b88 is described below
commit 72b6173b88843050bcf28f4d0e35609aef2d4382
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 18 16:05:59 2026 -0500
TIKA-4663 -- add cli option for markdown in 3.x (#2619)
---
tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 16 ++++++++++++++--
.../src/test/java/org/apache/tika/cli/TikaCLITest.java | 12 ++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index bbdd83d11d..37544c59c0 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -101,6 +101,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToMarkdownContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.serialization.JsonMetadata;
@@ -203,6 +204,12 @@ public class TikaCLI {
private boolean pipeMode = true;
private boolean fork = false;
private boolean prettyPrint;
+ private final OutputType MARKDOWN = new OutputType() {
+ @Override
+ protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
+ return new BodyContentHandler(new ToMarkdownContentHandler(getOutputWriter(output, encoding)));
+ }
+ };
private final OutputType XML = new OutputType() {
@Override
protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
@@ -405,6 +412,8 @@ public class TikaCLI {
type = XML;
} else if (arg.equals("-h") || arg.equals("--html")) {
type = HTML;
+ } else if (arg.equals("--md")) {
+ type = MARKDOWN;
} else if (arg.equals("-t") || arg.equals("--text")) {
type = TEXT;
} else if (arg.equals("-T") || arg.equals("--text-main")) {
@@ -500,6 +509,8 @@ public class TikaCLI {
handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
} else if (type.equals(TEXT_MAIN)) {
handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+ } else if (type.equals(MARKDOWN)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
} else if (type.equals(METADATA)) {
handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
}
@@ -530,12 +541,13 @@ public class TikaCLI {
out.println(" -t or --text Output plain text content (body)");
out.println(" -T or --text-main Output plain text content (main content only via boilerpipe handler)");
out.println(" -A or --text-all Output all text content");
- out.println(" -m or --metadata Output only metadata");
+ out.println(" --md Output Markdown content (body)");
+ out.println(" -m or --metadata Output only metadata (no content)");
out.println(" -j or --json Output metadata in JSON");
out.println(" -y or --xmp Output metadata in XMP");
out.println(" -J or --jsonRecursive Output metadata and content from all");
out.println(" embedded files (choose content type");
- out.println(" with -x, -h, -t or -m; default is -x)");
+ out.println(" with -x, -h, --md, -t or -m; default is -x)");
out.println(" -a or --async Run Tika in async mode; must specify details in a" + " tikaConfig file");
out.println(" -l or --language Output only language");
out.println(" -d or --detect Detect document type");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 33e311154a..8169ca7f89 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -170,6 +170,18 @@ public class TikaCLITest {
assertTrue(content.contains("finished off the cake"));
}
+ /**
+ * Tests --md option of the cli
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testMarkdownOutput() throws Exception {
+ String content = getParamOutContent("--md", resourcePrefix + "coffee.xls");
+ assertTrue(content.contains("# Sheet1"), "Expected markdown heading");
+ assertTrue(content.contains("| ---"), "Expected markdown table separator");
+ }
+
/**
* Tests -A option of the cli
*