This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4663-tika-app-3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6f5eda263700e5d1e6f73e2ec3c26b594f16fe88
Author: tallison <[email protected]>
AuthorDate: Wed Feb 18 14:21:08 2026 -0500

    TIKA-4663 -- add cli option for markdown in 3.x
---
 tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java  | 16 ++++++++++++++--
 .../src/test/java/org/apache/tika/cli/TikaCLITest.java   | 12 ++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index bbdd83d11d..37544c59c0 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -101,6 +101,7 @@ import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToMarkdownContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
 import org.apache.tika.serialization.JsonMetadata;
@@ -203,6 +204,12 @@ public class TikaCLI {
     private boolean pipeMode = true;
     private boolean fork = false;
     private boolean prettyPrint;
+    private final OutputType MARKDOWN = new OutputType() {
+        @Override
+        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
+            return new BodyContentHandler(new ToMarkdownContentHandler(getOutputWriter(output, encoding)));
+        }
+    };
     private final OutputType XML = new OutputType() {
         @Override
         protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
@@ -405,6 +412,8 @@ public class TikaCLI {
             type = XML;
         } else if (arg.equals("-h") || arg.equals("--html")) {
             type = HTML;
+        } else if (arg.equals("--md")) {
+            type = MARKDOWN;
         } else if (arg.equals("-t") || arg.equals("--text")) {
             type = TEXT;
         } else if (arg.equals("-T") || arg.equals("--text-main")) {
@@ -500,6 +509,8 @@ public class TikaCLI {
             handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
         } else if (type.equals(TEXT_MAIN)) {
             handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+        } else if (type.equals(MARKDOWN)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
         } else if (type.equals(METADATA)) {
             handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
         }
@@ -530,12 +541,13 @@ public class TikaCLI {
         out.println("    -t  or --text          Output plain text content (body)");
         out.println("    -T  or --text-main     Output plain text content (main content only via boilerpipe handler)");
         out.println("    -A  or --text-all      Output all text content");
-        out.println("    -m  or --metadata      Output only metadata");
+        out.println("    --md                   Output Markdown content (body)");
+        out.println("    -m  or --metadata      Output only metadata (no content)");
         out.println("    -j  or --json          Output metadata in JSON");
         out.println("    -y  or --xmp           Output metadata in XMP");
         out.println("    -J  or --jsonRecursive Output metadata and content from all");
         out.println("                           embedded files (choose content type");
-        out.println("                           with -x, -h, -t or -m; default is -x)");
+        out.println("                           with -x, -h, --md, -t or -m; default is -x)");
         out.println("    -a  or --async         Run Tika in async mode; must specify details in a" + " tikaConfig file");
         out.println("    -l  or --language      Output only language");
         out.println("    -d  or --detect        Detect document type");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 33e311154a..8169ca7f89 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -170,6 +170,18 @@ public class TikaCLITest {
         assertTrue(content.contains("finished off the cake"));
     }
 
+    /**
+     * Tests --md option of the cli
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testMarkdownOutput() throws Exception {
+        String content = getParamOutContent("--md", resourcePrefix + "coffee.xls");
+        assertTrue(content.contains("# Sheet1"), "Expected markdown heading");
+        assertTrue(content.contains("| ---"), "Expected markdown table separator");
+    }
+
     /**
      * Tests -A option of the cli
      *

Reply via email to