This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4663-markdown-in-tika-batch in repository https://gitbox.apache.org/repos/asf/tika.git
commit 01faead43f106c35f45e1fce42ccb1d78464b6c9 Author: tallison <[email protected]> AuthorDate: Sun Feb 22 11:40:29 2026 -0500 TIKA-4663 -- add cli option for markdown in 3.x to include tika-batch --- .../main/java/org/apache/tika/cli/BatchCommandLineBuilder.java | 3 +++ .../java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java | 10 ++++++++++ .../java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java | 8 ++++++++ .../tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java | 3 +++ .../org/apache/tika/batch/fs/default-tika-batch-config.xml | 2 +- 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java index ee88595e23..108326498c 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java +++ b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java @@ -203,6 +203,9 @@ class BatchCommandLineBuilder { map.remove("-T"); map.remove("--text-main"); map.put("-basicHandlerType", "body"); + } else if (map.containsKey("--md")) { + map.remove("--md"); + map.put("-basicHandlerType", "markdown"); } if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) { diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java index d25b35cfd7..3359f75db7 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java @@ -173,6 +173,16 @@ public class TikaCLIBatchCommandLineTest { } + @Test + public void testMarkdownMapping() throws Exception { + String[] params = {"-i", testInputPathForCommandLine, "-o", "outputRoot", "--md"}; + String[] commandLine = BatchCommandLineBuilder.build(params); + Map<String, String> attrs = mapify(commandLine); + assertEquals("markdown", attrs.get("-basicHandlerType")); + assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); + assertEquals("outputRoot", attrs.get("-outputDir")); + } + @Test public void testOneDirOneFileException() throws Exception { boolean ex = false; diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java index 15557488b1..10649e6e44 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java @@ -158,6 +158,14 @@ public class TikaCLIBatchIntegrationTest { } } + @Test + public void testMarkdownBatchIntegration() throws Exception { + String[] params = {"-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers", "2", "--md"}; + TikaCLI.main(params); + assertFileExists(tempOutputDir.resolve("bad_xml.xml.md")); + assertFileExists(tempOutputDir.resolve("coffee.xls.md")); + } + @Test public void testProcessLogFileConfig() throws Exception { String[] params = {"-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers", "2", "-JDlog4j.configurationFile=" + customBatchLogging.toUri()}; diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java index 88a3bd085c..0a0d87fce5 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java @@ -268,6 +268,9 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { case HTML: sb.append("html"); break; + case MARKDOWN: + sb.append("md"); + break; default: sb.append("txt"); } diff --git a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml index 51cbe697e8..36fe1460ec 100644 --- a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml +++ b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml @@ -67,7 +67,7 @@ <option opt="handleExisting" hasArg="true" description="if an output file already exists, do you want to: overwrite, rename or skip"/> <option opt="basicHandlerType" hasArg="true" - description="what type of content handler: xml, text, html, body"/> + description="what type of content handler: xml, text, html, body, markdown/md"/> <option opt="outputSuffix" hasArg="true" description="suffix to add to the end of the output file name"/> <option opt="timeoutThresholdMillis" hasArg="true"
