(tika) 02/02: TIKA-4653 - add markdown contenthandler

tallison Mon, 09 Feb 2026 03:29:17 -0800

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4653-markdown-handler
in repository https://gitbox.apache.org/repos/asf/tika.git


commit a637abb926bbe39c84cf17d4969b1ba675844959
Author: tallison <[email protected]>
AuthorDate: Mon Feb 9 06:28:55 2026 -0500

    TIKA-4653 - add markdown contenthandler
---
 .../apache/tika/sax/ToMarkdownContentHandler.java  | 542 ++++++++++++
 .../tika/sax/ToMarkdownContentHandlerTest.java     | 941 +++++++++++++++++++++
 .../core/resource/RecursiveMetadataResource.java   |  18 +-
 .../tika/server/core/resource/TikaResource.java    |  41 +-
 .../standard/RecursiveMetadataResourceTest.java    |  38 +
 .../tika/server/standard/TikaResourceTest.java     |  14 +
 6 files changed, 1584 insertions(+), 10 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
new file mode 100644
index 0000000000..34e5e96cef
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
@@ -0,0 +1,542 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+import java.util.Locale;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX event handler that writes content as Markdown.
+ * Supports headings, paragraphs, bold, italic, links, images, lists (ordered
+ * and unordered, including nested), tables (GFM pipe tables), code blocks,
+ * inline code, blockquotes, horizontal rules, and definition lists.
+ * <p>
+ * Content within &lt;script&gt; and &lt;style&gt; tags is ignored.
+ * </p>
+ * <p>
+ * Link text and table cells are buffered until their element closes. Markup
+ * generated while a buffer is active (emphasis markers, images, line breaks,
+ * ...) is appended to that buffer so that it stays in document order.
+ * </p>
+ *
+ * @since Apache Tika 3.2
+ */
+public class ToMarkdownContentHandler extends DefaultHandler {
+
+    private final Writer writer;
+
+    // Names of the currently open elements, innermost first
+    private final Deque<String> elementStack = new ArrayDeque<>();
+    // One entry per open <ul>/<ol>, innermost first
+    private final Deque<ListState> listStack = new ArrayDeque<>();
+
+    // Link buffering: linkText is non-null while inside an <a> element
+    private StringBuilder linkText;
+    private String linkHref;
+
+    // Table buffering (only the outermost table is rendered; nested tables are ignored)
+    private int tableDepth = 0;
+    private List<List<String>> tableRows;
+    private List<String> currentRow;
+    // Non-null while inside a <th>/<td> of the outermost table
+    private StringBuilder currentCell;
+
+    // Blockquote
+    private int blockquoteDepth = 0;
+
+    // Code
+    private boolean inPreBlock = false;
+    private boolean inInlineCode = false;
+
+    // Script/style suppression
+    private int scriptDepth = 0;
+    private int styleDepth = 0;
+
+    // Spacing
+    private boolean needsBlockSeparator = false;
+    // True when the last character sent to the writer was '\n' (or nothing was sent yet)
+    private boolean atLineStart = true;
+
+    // Track if we've written any content at all
+    private boolean hasContent = false;
+
+    /**
+     * Creates a handler that writes Markdown to the given writer.
+     *
+     * @param writer destination for the generated Markdown
+     */
+    public ToMarkdownContentHandler(Writer writer) {
+        this.writer = writer;
+    }
+
+    /**
+     * Creates a handler that writes Markdown to the given stream.
+     *
+     * @param stream   destination stream
+     * @param encoding name of the character encoding used for the output
+     * @throws UnsupportedEncodingException if the named encoding is not supported
+     */
+    public ToMarkdownContentHandler(OutputStream stream, String encoding)
+            throws UnsupportedEncodingException {
+        this(new OutputStreamWriter(stream, encoding));
+    }
+
+    /**
+     * Creates a handler that collects the Markdown in memory; the result is
+     * available from {@link #toString()}.
+     */
+    public ToMarkdownContentHandler() {
+        this(new StringWriter());
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts)
+            throws SAXException {
+        String name = localName(localName, qName);
+        elementStack.push(name);
+
+        // Track script/style depth; nothing inside them is rendered
+        if (name.equals("script")) {
+            scriptDepth++;
+            return;
+        }
+        if (name.equals("style")) {
+            styleDepth++;
+            return;
+        }
+        if (scriptDepth > 0 || styleDepth > 0) {
+            return;
+        }
+
+        switch (name) {
+            case "h1":
+            case "h2":
+            case "h3":
+            case "h4":
+            case "h5":
+            case "h6":
+                emitBlockSeparator();
+                int level = name.charAt(1) - '0';
+                write(repeatChar('#', level) + " ");
+                break;
+            case "p":
+                emitBlockSeparator();
+                break;
+            case "b":
+            case "strong":
+                write("**");
+                break;
+            case "i":
+            case "em":
+                write("*");
+                break;
+            case "a":
+                linkHref = atts.getValue("href");
+                linkText = new StringBuilder();
+                break;
+            case "img":
+                String alt = atts.getValue("alt");
+                String src = atts.getValue("src");
+                write("![" + (alt != null ? alt : "") + "](" + (src != null ? src : "") + ")");
+                break;
+            case "ul":
+            case "ol":
+                if (listStack.isEmpty()) {
+                    emitBlockSeparator();
+                } else {
+                    // Nested list: no blank line, but the parent item's text must be
+                    // terminated so the first nested item starts on its own line.
+                    ensureLineBreak();
+                }
+                listStack.push(new ListState(name.equals("ol"), listStack.size()));
+                break;
+            case "li":
+                if (!listStack.isEmpty()) {
+                    ListState state = listStack.peek();
+                    String indent = repeatChar(' ', state.depth * 4);
+                    if (state.ordered) {
+                        state.counter++;
+                        write(indent + state.counter + ". ");
+                    } else {
+                        write(indent + "- ");
+                    }
+                }
+                break;
+            case "blockquote":
+                emitBlockSeparator();
+                blockquoteDepth++;
+                break;
+            case "pre":
+                emitBlockSeparator();
+                inPreBlock = true;
+                write("```\n");
+                break;
+            case "code":
+                if (!inPreBlock) {
+                    inInlineCode = true;
+                    write("`");
+                }
+                break;
+            case "br":
+                write("\n");
+                break;
+            case "hr":
+                emitBlockSeparator();
+                write("---");
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "table":
+                tableDepth++;
+                if (tableDepth == 1) {
+                    emitBlockSeparator();
+                    tableRows = new ArrayList<>();
+                }
+                break;
+            case "tr":
+                if (tableDepth == 1 && tableRows != null) {
+                    currentRow = new ArrayList<>();
+                }
+                break;
+            case "th":
+            case "td":
+                if (tableDepth == 1 && currentRow != null) {
+                    currentCell = new StringBuilder();
+                }
+                break;
+            case "dt":
+                emitBlockSeparator();
+                write("**");
+                break;
+            case "dd":
+                write("\n: ");
+                break;
+            case "div":
+                emitBlockSeparator();
+                break;
+            default:
+                // Ignore structural elements like html, head, body, title, meta
+                break;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        String name = localName(localName, qName);
+
+        if (!elementStack.isEmpty()) {
+            elementStack.pop();
+        }
+
+        // Track script/style depth; never let an unmatched end tag push a counter
+        // below zero, otherwise a later <script>/<style> would not be suppressed.
+        if (name.equals("script")) {
+            scriptDepth = Math.max(0, scriptDepth - 1);
+            return;
+        }
+        if (name.equals("style")) {
+            styleDepth = Math.max(0, styleDepth - 1);
+            return;
+        }
+        if (scriptDepth > 0 || styleDepth > 0) {
+            return;
+        }
+
+        switch (name) {
+            case "h1":
+            case "h2":
+            case "h3":
+            case "h4":
+            case "h5":
+            case "h6":
+            case "p":
+            case "dd":
+            case "div":
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "b":
+            case "strong":
+            case "dt":
+                write("**");
+                break;
+            case "i":
+            case "em":
+                write("*");
+                break;
+            case "a":
+                if (linkText != null) {
+                    String text = linkText.toString();
+                    String href = linkHref != null ? linkHref : "";
+                    // Stop buffering before emitting, so the link is not appended to itself
+                    linkText = null;
+                    linkHref = null;
+                    write("[" + text + "](" + href + ")");
+                }
+                break;
+            case "ul":
+            case "ol":
+                if (!listStack.isEmpty()) {
+                    listStack.pop();
+                }
+                if (listStack.isEmpty()) {
+                    needsBlockSeparator = true;
+                    hasContent = true;
+                }
+                break;
+            case "li":
+                // After a nested list the line is already terminated; a second
+                // newline would insert a blank line into the enclosing list.
+                ensureLineBreak();
+                break;
+            case "blockquote":
+                blockquoteDepth = Math.max(0, blockquoteDepth - 1);
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "pre":
+                ensureLineBreak();
+                write("```");
+                inPreBlock = false;
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "code":
+                if (!inPreBlock) {
+                    inInlineCode = false;
+                    write("`");
+                }
+                break;
+            case "table":
+                if (tableDepth == 1) {
+                    // Drop any unterminated cell/row first so the rendered table
+                    // goes to the writer rather than into a stale cell buffer.
+                    currentCell = null;
+                    currentRow = null;
+                    emitTable();
+                    tableRows = null;
+                    needsBlockSeparator = true;
+                    hasContent = true;
+                }
+                tableDepth = Math.max(0, tableDepth - 1);
+                break;
+            case "tr":
+                if (tableDepth == 1 && tableRows != null && currentRow != null) {
+                    tableRows.add(currentRow);
+                    currentRow = null;
+                }
+                break;
+            case "th":
+            case "td":
+                if (tableDepth == 1 && currentRow != null && currentCell != null) {
+                    currentRow.add(normalizeCell(currentCell.toString()));
+                    currentCell = null;
+                }
+                break;
+            default:
+                break;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (scriptDepth > 0 || styleDepth > 0) {
+            return;
+        }
+
+        // Buffer into link text
+        if (linkText != null) {
+            linkText.append(ch, start, length);
+            return;
+        }
+
+        // Buffer into table cell
+        if (currentCell != null) {
+            currentCell.append(ch, start, length);
+            return;
+        }
+
+        String text = new String(ch, start, length);
+
+        // In pre blocks and inline code, write raw (no escaping)
+        if (inPreBlock || inInlineCode) {
+            write(text);
+            return;
+        }
+
+        if (text.isEmpty()) {
+            return;
+        }
+
+        // Add blockquote prefix if needed at line start
+        if (blockquoteDepth > 0 && atLineStart) {
+            write(repeatChar('>', blockquoteDepth) + " ");
+        }
+
+        // Escape markdown special characters in normal text
+        write(escapeMarkdown(text));
+        hasContent = true;
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        characters(ch, start, length);
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        try {
+            writer.flush();
+        } catch (IOException e) {
+            throw new SAXException("Error flushing character output", e);
+        }
+    }
+
+    /**
+     * Returns {@code toString()} of the underlying writer. This is the generated
+     * Markdown when the handler was created with the no-argument constructor
+     * (which uses a {@link StringWriter}).
+     */
+    @Override
+    public String toString() {
+        return writer.toString();
+    }
+
+    /**
+     * Sends generated output to the innermost active target: the link buffer,
+     * else the table cell buffer, else the writer. Line-start tracking is only
+     * updated for output that actually reaches the writer.
+     */
+    private void write(String s) throws SAXException {
+        if (s.isEmpty()) {
+            return;
+        }
+        if (linkText != null) {
+            linkText.append(s);
+            return;
+        }
+        if (currentCell != null) {
+            currentCell.append(s);
+            return;
+        }
+        try {
+            writer.write(s);
+        } catch (IOException e) {
+            throw new SAXException("Error writing: " + s, e);
+        }
+        atLineStart = s.charAt(s.length() - 1) == '\n';
+    }
+
+    /** Returns true while output is being collected in a link or table cell buffer. */
+    private boolean isBuffering() {
+        return linkText != null || currentCell != null;
+    }
+
+    /**
+     * Writes a newline unless the writer is already at the start of a line.
+     * While buffering, the line position of the buffer is not tracked, so a
+     * newline is always appended.
+     */
+    private void ensureLineBreak() throws SAXException {
+        if (isBuffering() || !atLineStart) {
+            write("\n");
+        }
+    }
+
+    /** Emits the pending blank line between two block-level elements, if any. */
+    private void emitBlockSeparator() throws SAXException {
+        if (needsBlockSeparator && hasContent) {
+            write("\n\n");
+            needsBlockSeparator = false;
+        }
+    }
+
+    /**
+     * Renders the buffered rows as a GFM pipe table. The first row is treated as
+     * the header; short rows are padded with empty cells.
+     */
+    private void emitTable() throws SAXException {
+        if (tableRows == null || tableRows.isEmpty()) {
+            return;
+        }
+
+        // Determine column count
+        int cols = 0;
+        for (List<String> row : tableRows) {
+            cols = Math.max(cols, row.size());
+        }
+        if (cols == 0) {
+            // Rows without any cell: nothing meaningful to render
+            return;
+        }
+
+        for (int r = 0; r < tableRows.size(); r++) {
+            List<String> row = tableRows.get(r);
+            StringBuilder line = new StringBuilder("|");
+            for (int c = 0; c < cols; c++) {
+                String cell = c < row.size() ? row.get(c) : "";
+                line.append(" ").append(cell).append(" |");
+            }
+            write(line.toString());
+            write("\n");
+
+            // Insert separator after first row
+            if (r == 0) {
+                StringBuilder sep = new StringBuilder("|");
+                for (int c = 0; c < cols; c++) {
+                    sep.append(" --- |");
+                }
+                write(sep.toString());
+                write("\n");
+            }
+        }
+    }
+
+    /**
+     * Prepares buffered cell content for a single pipe-table row: trims it,
+     * replaces each run of line breaks with one space and escapes pipe characters.
+     */
+    private static String normalizeCell(String raw) {
+        String trimmed = raw.trim();
+        StringBuilder sb = new StringBuilder(trimmed.length());
+        boolean pendingSpace = false;
+        for (int i = 0; i < trimmed.length(); i++) {
+            char c = trimmed.charAt(i);
+            if (c == '\n' || c == '\r') {
+                pendingSpace = true;
+                continue;
+            }
+            if (pendingSpace) {
+                sb.append(' ');
+                pendingSpace = false;
+            }
+            if (c == '|') {
+                sb.append('\\');
+            }
+            sb.append(c);
+        }
+        return sb.toString();
+    }
+
+    /** Backslash-escapes the characters that have a special meaning in Markdown text. */
+    private static String escapeMarkdown(String text) {
+        StringBuilder sb = new StringBuilder(text.length());
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+            switch (c) {
+                case '\\':
+                case '`':
+                case '*':
+                case '_':
+                case '[':
+                case ']':
+                case '#':
+                case '|':
+                    sb.append('\\').append(c);
+                    break;
+                default:
+                    sb.append(c);
+                    break;
+            }
+        }
+        return sb.toString();
+    }
+
+    /** Returns a string consisting of {@code count} copies of {@code c}. */
+    private static String repeatChar(char c, int count) {
+        StringBuilder sb = new StringBuilder(count);
+        for (int i = 0; i < count; i++) {
+            sb.append(c);
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns the lower-cased element name, preferring the SAX local name and
+     * falling back to the qualified name with any namespace prefix removed.
+     */
+    private static String localName(String localName, String qName) {
+        if (localName != null && !localName.isEmpty()) {
+            return localName.toLowerCase(Locale.ROOT);
+        }
+        if (qName != null) {
+            // Strip namespace prefix
+            int colon = qName.indexOf(':');
+            String name = colon >= 0 ? qName.substring(colon + 1) : qName;
+            return name.toLowerCase(Locale.ROOT);
+        }
+        return "";
+    }
+
+    /** State of one open list: its kind, nesting depth and running item number. */
+    private static class ListState {
+        final boolean ordered;
+        final int depth;
+        int counter;
+
+        ListState(boolean ordered, int depth) {
+            this.ordered = ordered;
+            this.depth = depth;
+            this.counter = 0;
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
new file mode 100644
index 0000000000..1ba3523a23
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
@@ -0,0 +1,941 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Random;
+
+import org.junit.jupiter.api.RepeatedTest;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Test cases for the {@link ToMarkdownContentHandler} class.
+ */
+public class ToMarkdownContentHandlerTest {
+
+    private static final String XHTML = "http://www.w3.org/1999/xhtml";
+    private static final Attributes EMPTY = new AttributesImpl();
+
+    private static void startElement(ContentHandler handler, String name) throws Exception {
+        handler.startElement(XHTML, name, name, EMPTY);
+    }
+
+    private static void startElement(ContentHandler handler, String name, String attrName,
+                                     String attrValue) throws Exception {
+        AttributesImpl atts = new AttributesImpl();
+        atts.addAttribute("", attrName, attrName, "CDATA", attrValue);
+        handler.startElement(XHTML, name, name, atts);
+    }
+
+    private static void startElement(ContentHandler handler, String name, AttributesImpl atts)
+            throws Exception {
+        handler.startElement(XHTML, name, name, atts);
+    }
+
+    private static void endElement(ContentHandler handler, String name) throws Exception {
+        handler.endElement(XHTML, name, name);
+    }
+
+    private static void chars(ContentHandler handler, String text) throws Exception {
+        char[] ch = text.toCharArray();
+        handler.characters(ch, 0, ch.length);
+    }
+
+    @Test
+    public void testHeadings() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "h1");
+        chars(handler, "Title");
+        endElement(handler, "h1");
+
+        startElement(handler, "h2");
+        chars(handler, "Subtitle");
+        endElement(handler, "h2");
+
+        startElement(handler, "h3");
+        chars(handler, "Section");
+        endElement(handler, "h3");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("# Title"));
+        assertTrue(result.contains("## Subtitle"));
+        assertTrue(result.contains("### Section"));
+    }
+
+    @Test
+    public void testAllHeadingLevels() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        for (int i = 1; i <= 6; i++) {
+            startElement(handler, "h" + i);
+            chars(handler, "H" + i);
+            endElement(handler, "h" + i);
+        }
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("# H1"));
+        assertTrue(result.contains("## H2"));
+        assertTrue(result.contains("### H3"));
+        assertTrue(result.contains("#### H4"));
+        assertTrue(result.contains("##### H5"));
+        assertTrue(result.contains("###### H6"));
+    }
+
+    @Test
+    public void testParagraphs() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "First paragraph.");
+        endElement(handler, "p");
+
+        startElement(handler, "p");
+        chars(handler, "Second paragraph.");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("First paragraph."));
+        assertTrue(result.contains("Second paragraph."));
+        // Paragraphs should be separated by blank line
+        assertTrue(result.contains("First paragraph.\n\nSecond paragraph."));
+    }
+
+    @Test
+    public void testBold() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "This is ");
+        startElement(handler, "b");
+        chars(handler, "bold");
+        endElement(handler, "b");
+        chars(handler, " text.");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("**bold**"));
+    }
+
+    @Test
+    public void testStrong() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        startElement(handler, "strong");
+        chars(handler, "strong");
+        endElement(handler, "strong");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("**strong**"));
+    }
+
+    @Test
+    public void testItalic() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "This is ");
+        startElement(handler, "i");
+        chars(handler, "italic");
+        endElement(handler, "i");
+        chars(handler, " text.");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("*italic*"));
+    }
+
+    @Test
+    public void testEmphasis() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        startElement(handler, "em");
+        chars(handler, "emphasized");
+        endElement(handler, "em");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("*emphasized*"));
+    }
+
+    @Test
+    public void testLink() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Click ");
+        startElement(handler, "a", "href", "https://example.com");
+        chars(handler, "here");
+        endElement(handler, "a");
+        chars(handler, " for more.");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("[here](https://example.com)"));
+    }
+
+    @Test
+    public void testImage() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        AttributesImpl atts = new AttributesImpl();
+        atts.addAttribute("", "alt", "alt", "CDATA", "A photo");
+        atts.addAttribute("", "src", "src", "CDATA", "photo.jpg");
+        startElement(handler, "img", atts);
+        endElement(handler, "img");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("![A photo](photo.jpg)"));
+    }
+
+    @Test
+    public void testUnorderedList() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "ul");
+        startElement(handler, "li");
+        chars(handler, "Apple");
+        endElement(handler, "li");
+        startElement(handler, "li");
+        chars(handler, "Banana");
+        endElement(handler, "li");
+        startElement(handler, "li");
+        chars(handler, "Cherry");
+        endElement(handler, "li");
+        endElement(handler, "ul");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("- Apple"));
+        assertTrue(result.contains("- Banana"));
+        assertTrue(result.contains("- Cherry"));
+    }
+
+    @Test
+    public void testOrderedList() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "ol");
+        startElement(handler, "li");
+        chars(handler, "First");
+        endElement(handler, "li");
+        startElement(handler, "li");
+        chars(handler, "Second");
+        endElement(handler, "li");
+        startElement(handler, "li");
+        chars(handler, "Third");
+        endElement(handler, "li");
+        endElement(handler, "ol");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("1. First"));
+        assertTrue(result.contains("2. Second"));
+        assertTrue(result.contains("3. Third"));
+    }
+
+    @Test
+    public void testNestedLists() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "ul");
+        startElement(handler, "li");
+        chars(handler, "Fruit");
+
+        startElement(handler, "ul");
+        startElement(handler, "li");
+        chars(handler, "Apple");
+        endElement(handler, "li");
+        startElement(handler, "li");
+        chars(handler, "Banana");
+        endElement(handler, "li");
+        endElement(handler, "ul");
+
+        endElement(handler, "li");
+        startElement(handler, "li");
+        chars(handler, "Vegetable");
+        endElement(handler, "li");
+        endElement(handler, "ul");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("- Fruit"));
+        assertTrue(result.contains("    - Apple"));
+        assertTrue(result.contains("    - Banana"));
+        assertTrue(result.contains("- Vegetable"));
+    }
+
+    @Test
+    public void testTable() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "table");
+
+        // Header row
+        startElement(handler, "tr");
+        startElement(handler, "th");
+        chars(handler, "Name");
+        endElement(handler, "th");
+        startElement(handler, "th");
+        chars(handler, "Age");
+        endElement(handler, "th");
+        endElement(handler, "tr");
+
+        // Data row
+        startElement(handler, "tr");
+        startElement(handler, "td");
+        chars(handler, "Alice");
+        endElement(handler, "td");
+        startElement(handler, "td");
+        chars(handler, "30");
+        endElement(handler, "td");
+        endElement(handler, "tr");
+
+        endElement(handler, "table");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("| Name | Age |"));
+        assertTrue(result.contains("| --- | --- |"));
+        assertTrue(result.contains("| Alice | 30 |"));
+    }
+
+    @Test
+    public void testFencedCodeBlock() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "pre");
+        startElement(handler, "code");
+        chars(handler, "int x = 42;");
+        endElement(handler, "code");
+        endElement(handler, "pre");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("```\n"));
+        assertTrue(result.contains("int x = 42;"));
+        assertTrue(result.contains("\n```"));
+    }
+
+    @Test
+    public void testInlineCode() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Use the ");
+        startElement(handler, "code");
+        chars(handler, "println");
+        endElement(handler, "code");
+        chars(handler, " function.");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("`println`"));
+    }
+
+    @Test
+    public void testBlockquote() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "blockquote");
+        chars(handler, "To be or not to be.");
+        endElement(handler, "blockquote");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("> To be or not to be."));
+    }
+
+    @Test
+    public void testHorizontalRule() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Above");
+        endElement(handler, "p");
+
+        startElement(handler, "hr");
+        endElement(handler, "hr");
+
+        startElement(handler, "p");
+        chars(handler, "Below");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("---"));
+        assertTrue(result.contains("Above"));
+        assertTrue(result.contains("Below"));
+    }
+
+    @Test
+    public void testLineBreak() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Line one");
+        startElement(handler, "br");
+        endElement(handler, "br");
+        chars(handler, "Line two");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("Line one\nLine two"));
+    }
+
+    @Test
+    public void testBoldInsideListItem() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "ul");
+        startElement(handler, "li");
+        startElement(handler, "b");
+        chars(handler, "Important");
+        endElement(handler, "b");
+        chars(handler, " item");
+        endElement(handler, "li");
+        endElement(handler, "ul");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("- **Important** item"));
+    }
+
+    @Test
+    public void testLinkInsideHeading() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "h2");
+        startElement(handler, "a", "href", "https://example.com");
+        chars(handler, "Linked Title");
+        endElement(handler, "a");
+        endElement(handler, "h2");
+
+        handler.endDocument();
+
+        assertTrue(handler.toString().contains("## [Linked Title](https://example.com)"));
+    }
+
+    @Test
+    public void testScriptContentSkipped() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Before");
+        endElement(handler, "p");
+
+        startElement(handler, "script");
+        chars(handler, "alert('xss');");
+        endElement(handler, "script");
+
+        startElement(handler, "p");
+        chars(handler, "After");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("Before"));
+        assertTrue(result.contains("After"));
+        assertFalse(result.contains("alert"));
+    }
+
+    @Test
+    public void testStyleContentSkipped() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Visible");
+        endElement(handler, "p");
+
+        startElement(handler, "style");
+        chars(handler, "body { color: red; }");
+        endElement(handler, "style");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("Visible"));
+        assertFalse(result.contains("color"));
+    }
+
+    @Test
+    public void testMarkdownEscaping() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        chars(handler, "Special chars: * _ [ ] # | \\ `");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("\\*"));
+        assertTrue(result.contains("\\_"));
+        assertTrue(result.contains("\\["));
+        assertTrue(result.contains("\\]"));
+        assertTrue(result.contains("\\#"));
+        assertTrue(result.contains("\\|"));
+        assertTrue(result.contains("\\\\"));
+        assertTrue(result.contains("\\`"));
+    }
+
+    @Test
+    public void testNoEscapingInCodeBlock() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "pre");
+        startElement(handler, "code");
+        chars(handler, "x * y = z");
+        endElement(handler, "code");
+        endElement(handler, "pre");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        // Inside code blocks, * should NOT be escaped
+        assertTrue(result.contains("x * y = z"));
+        assertFalse(result.contains("\\*"));
+    }
+
+    @Test
+    public void testNoEscapingInInlineCode() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        startElement(handler, "code");
+        chars(handler, "a*b");
+        endElement(handler, "code");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("`a*b`"));
+    }
+
+    @Test
+    public void testDefinitionList() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "dl");
+        startElement(handler, "dt");
+        chars(handler, "Term");
+        endElement(handler, "dt");
+        startElement(handler, "dd");
+        chars(handler, "Definition of the term");
+        endElement(handler, "dd");
+        endElement(handler, "dl");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("**Term**"));
+        assertTrue(result.contains(": Definition of the term"));
+    }
+
+    @Test
+    public void testDiv() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "div");
+        chars(handler, "Content in div");
+        endElement(handler, "div");
+
+        startElement(handler, "div");
+        chars(handler, "Another div");
+        endElement(handler, "div");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("Content in div"));
+        assertTrue(result.contains("Another div"));
+        // Divs should be separated
+        assertTrue(result.contains("Content in div\n\nAnother div"));
+    }
+
+    @Test
+    public void testHandlerTypeParsingMarkdown() {
+        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+                BasicContentHandlerFactory.parseHandlerType("markdown",
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+                BasicContentHandlerFactory.parseHandlerType("md",
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+                BasicContentHandlerFactory.parseHandlerType("MARKDOWN",
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+                BasicContentHandlerFactory.parseHandlerType("MD",
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+    }
+
+    @Test
+    public void testFactoryCreatesMarkdownHandler() {
+        BasicContentHandlerFactory factory =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, -1);
+        org.xml.sax.ContentHandler handler = factory.createHandler();
+        assertTrue(handler instanceof ToMarkdownContentHandler);
+    }
+
+    @Test
+    public void testTableWithOnlyTd() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "table");
+
+        startElement(handler, "tr");
+        startElement(handler, "td");
+        chars(handler, "A");
+        endElement(handler, "td");
+        startElement(handler, "td");
+        chars(handler, "B");
+        endElement(handler, "td");
+        endElement(handler, "tr");
+
+        startElement(handler, "tr");
+        startElement(handler, "td");
+        chars(handler, "C");
+        endElement(handler, "td");
+        startElement(handler, "td");
+        chars(handler, "D");
+        endElement(handler, "td");
+        endElement(handler, "tr");
+
+        endElement(handler, "table");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        assertTrue(result.contains("| A | B |"));
+        assertTrue(result.contains("| --- | --- |"));
+        assertTrue(result.contains("| C | D |"));
+    }
+
+    @Test
+    public void testNestedTablesIgnored() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "table");
+
+        // Outer header row
+        startElement(handler, "tr");
+        startElement(handler, "th");
+        chars(handler, "Outer1");
+        endElement(handler, "th");
+        startElement(handler, "th");
+        chars(handler, "Outer2");
+        endElement(handler, "th");
+        endElement(handler, "tr");
+
+        // Outer data row with nested table in second cell
+        startElement(handler, "tr");
+        startElement(handler, "td");
+        chars(handler, "A");
+        endElement(handler, "td");
+        startElement(handler, "td");
+        chars(handler, "B");
+
+        // Nested table -- should be ignored
+        startElement(handler, "table");
+        startElement(handler, "tr");
+        startElement(handler, "td");
+        chars(handler, "Inner");
+        endElement(handler, "td");
+        endElement(handler, "tr");
+        endElement(handler, "table");
+
+        endElement(handler, "td");
+        endElement(handler, "tr");
+
+        endElement(handler, "table");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        // Outer table should be rendered
+        assertTrue(result.contains("| Outer1 | Outer2 |"));
+        assertTrue(result.contains("| --- | --- |"));
+        // Inner cell text gets folded into the outer cell ("B" + "Inner" = "BInner")
+        assertTrue(result.contains("| A | BInner |"));
+        // Inner table structure should not appear as a separate table
+        assertFalse(result.contains("| Inner |"));
+    }
+
+    private static final String[] ALL_ELEMENTS = {
+            "h1", "h2", "h3", "h4", "h5", "h6",
+            "p", "div", "span",
+            "b", "strong", "i", "em",
+            "a", "img",
+            "ul", "ol", "li",
+            "table", "tr", "th", "td",
+            "blockquote", "pre", "code",
+            "br", "hr",
+            "dl", "dt", "dd",
+            "script", "style",
+            "html", "head", "body", "title", "meta"
+    };
+
+    /**
+     * Randomized test: fire random sequences of startElement/endElement/characters
+     * events with no guarantee of proper nesting. The handler must not throw any
+     * runtime exceptions (e.g., EmptyStackException, NullPointerException,
+     * IndexOutOfBoundsException).
+     */
+    @RepeatedTest(20)
+    public void testRandomUnbalancedTags() throws Exception {
+        Random rng = new Random();
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            int numEvents = 50 + rng.nextInt(150);
+            for (int i = 0; i < numEvents; i++) {
+                int action = rng.nextInt(4);
+                String elem = ALL_ELEMENTS[rng.nextInt(ALL_ELEMENTS.length)];
+                switch (action) {
+                    case 0:
+                        // start element (possibly with attributes)
+                        if (elem.equals("a")) {
+                            startElement(handler, elem, "href", "http://example.com");
+                        } else if (elem.equals("img")) {
+                            AttributesImpl atts = new AttributesImpl();
+                            atts.addAttribute("", "src", "src", "CDATA", "img.png");
+                            atts.addAttribute("", "alt", "alt", "CDATA", "alt text");
+                            startElement(handler, elem, atts);
+                        } else {
+                            startElement(handler, elem);
+                        }
+                        break;
+                    case 1:
+                        // end element (possibly unmatched)
+                        endElement(handler, elem);
+                        break;
+                    case 2:
+                        // characters
+                        chars(handler, "text_" + i);
+                        break;
+                    case 3:
+                        // ignorable whitespace
+                        char[] ws = "  \t\n".toCharArray();
+                        handler.ignorableWhitespace(ws, 0, ws.length);
+                        break;
+                }
+            }
+
+            handler.endDocument();
+        });
+
+        // Just verify we can get the output without error
+        assertDoesNotThrow(() -> handler.toString());
+    }
+
+    /**
+     * Test extra endElement calls with no matching start -- should not throw.
+     */
+    @Test
+    public void testExtraEndElements() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            // End elements with no matching starts
+            endElement(handler, "p");
+            endElement(handler, "table");
+            endElement(handler, "tr");
+            endElement(handler, "td");
+            endElement(handler, "ul");
+            endElement(handler, "li");
+            endElement(handler, "a");
+            endElement(handler, "pre");
+            endElement(handler, "code");
+            endElement(handler, "blockquote");
+            endElement(handler, "b");
+            endElement(handler, "i");
+            endElement(handler, "script");
+            endElement(handler, "style");
+
+            handler.endDocument();
+        });
+    }
+
+    /**
+     * Test start elements with no matching end -- should not throw.
+     */
+    @Test
+    public void testUnclosedElements() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            startElement(handler, "p");
+            chars(handler, "unclosed paragraph");
+            startElement(handler, "b");
+            chars(handler, "unclosed bold");
+            startElement(handler, "a", "href", "http://example.com");
+            chars(handler, "unclosed link");
+            startElement(handler, "ul");
+            startElement(handler, "li");
+            chars(handler, "unclosed list item");
+            startElement(handler, "table");
+            startElement(handler, "tr");
+            startElement(handler, "td");
+            chars(handler, "unclosed cell");
+            startElement(handler, "blockquote");
+            chars(handler, "unclosed quote");
+            startElement(handler, "pre");
+            chars(handler, "unclosed pre");
+
+            handler.endDocument();
+        });
+    }
+
+    /**
+     * Test deeply nested elements of the same type -- should not throw.
+     */
+    @Test
+    public void testDeeplyNestedSameElement() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            // Deeply nested lists
+            for (int i = 0; i < 50; i++) {
+                startElement(handler, "ul");
+                startElement(handler, "li");
+                chars(handler, "level " + i);
+            }
+            for (int i = 0; i < 50; i++) {
+                endElement(handler, "li");
+                endElement(handler, "ul");
+            }
+
+            // Deeply nested blockquotes
+            for (int i = 0; i < 20; i++) {
+                startElement(handler, "blockquote");
+            }
+            chars(handler, "deep quote");
+            for (int i = 0; i < 20; i++) {
+                endElement(handler, "blockquote");
+            }
+
+            handler.endDocument();
+        });
+    }
+
+    /**
+     * Test interleaved (improperly nested) elements -- should not throw.
+     */
+    @Test
+    public void testInterleavedElements() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            // <b><i>text</b></i> -- improper nesting
+            startElement(handler, "b");
+            startElement(handler, "i");
+            chars(handler, "interleaved");
+            endElement(handler, "b");
+            endElement(handler, "i");
+
+            // <table><p>text</table></p>
+            startElement(handler, "table");
+            startElement(handler, "p");
+            chars(handler, "table with p");
+            endElement(handler, "table");
+            endElement(handler, "p");
+
+            // <ul><h1>text</ul></h1>
+            startElement(handler, "ul");
+            startElement(handler, "h1");
+            chars(handler, "list with heading");
+            endElement(handler, "ul");
+            endElement(handler, "h1");
+
+            handler.endDocument();
+        });
+    }
+}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 7fbf7f2566..a129e47f30 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -106,12 +106,13 @@ public class RecursiveMetadataResource {
      * The extracted text content is stored with the key
      * {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}.
      * <p>
-     * Specify the handler for the content (xml, html, text, ignore)
+     * Specify the handler for the content (xml, html, text, markdown, ignore)
      * in the path:<br/>
      * /rmeta/form (default: xml)<br/>
-     * /rmeta/form/xml    (store the content as xml)<br/>
-     * /rmeta/form/text   (store the content as text)<br/>
-     * /rmeta/form/ignore (don't record any content)<br/>
+     * /rmeta/form/xml      (store the content as xml)<br/>
+     * /rmeta/form/text     (store the content as text)<br/>
+     * /rmeta/form/markdown (store the content as markdown)<br/>
+     * /rmeta/form/ignore   (don't record any content)<br/>
      *
      * @param att             attachment
      * @param info            uri info
@@ -182,12 +183,13 @@ public class RecursiveMetadataResource {
      * The extracted text content is stored with the key
      * {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}.
      * <p>
-     * Specify the handler for the content (xml, html, text, ignore)
+     * Specify the handler for the content (xml, html, text, markdown, ignore)
      * in the path:<br/>
      * /rmeta (default: xml)<br/>
-     * /rmeta/xml    (store the content as xml)<br/>
-     * /rmeta/text   (store the content as text)<br/>
-     * /rmeta/ignore (don't record any content)<br/>
+     * /rmeta/xml      (store the content as xml)<br/>
+     * /rmeta/text     (store the content as text)<br/>
+     * /rmeta/markdown (store the content as markdown)<br/>
+     * /rmeta/ignore   (don't record any content)<br/>
      *
      * @param handlerTypeName which type of handler to use
      * @return InputStream that can be deserialized as a list of {@link Metadata} objects
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 5819ccb551..354331ce38 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -76,7 +76,7 @@ public class TikaResource {
     public static final String GREETING = "This is Tika Server (" + Tika.getString() + "). Please PUT\n";
     /**
      * Header to specify the handler type for content extraction.
-     * Valid values: text, html, xml, ignore (default: text)
+     * Valid values: text, html, xml, markdown, ignore (default: text)
      */
     public static final String HANDLER_TYPE_HEADER = "X-Tika-Handler";
     private static final String META_PREFIX = "meta_";
@@ -549,6 +549,21 @@ public class TikaResource {
         return produceRawOutput(tis, Metadata.newInstance(context), httpHeaders.getRequestHeaders(), "xml");
     }
 
+    /**
+     * Parse document and return Markdown content.
+     */
+    @PUT
+    @Consumes("*/*")
+    @Produces("text/plain")
+    @Path("md")
+    public StreamingOutput getMarkdown(final InputStream is, @Context HttpHeaders httpHeaders)
+            throws IOException {
+        TikaInputStream tis = TikaInputStream.get(is);
+        tis.getPath(); // Spool to temp file for pipes-based parsing
+        ParseContext context = createParseContext();
+        return produceRawOutput(tis, Metadata.newInstance(context), httpHeaders.getRequestHeaders(), "md");
+    }
+
     /**
      * Parse document and return JSON with metadata and text content.
      */
@@ -675,6 +690,28 @@ public class TikaResource {
         return produceRawOutput(tis, metadata, context, "xml");
     }
 
+    /**
+     * Parse multipart document with optional config, return Markdown.
+     * <p>
+     * Accepts multipart with:
+     * - "file" part (required): the document to parse
+     * - "config" part (optional): JSON configuration for parser settings
+     * <p>
+     * This endpoint is gated behind enableUnsecureFeatures=true because per-request
+     * configuration could enable dangerous operations.
+     */
+    @POST
+    @Consumes("multipart/form-data")
+    @Produces("text/plain")
+    @Path("config/md")
+    public StreamingOutput postMarkdown(List<Attachment> attachments, @Context HttpHeaders httpHeaders)
+            throws IOException {
+        ParseContext context = createParseContext();
+        Metadata metadata = Metadata.newInstance(context);
+        TikaInputStream tis = setupMultipartConfig(attachments, metadata, context);
+        return produceRawOutput(tis, metadata, context, "md");
+    }
+
     /**
      * Parse multipart document with optional config, return JSON.
      * <p>
@@ -702,7 +739,7 @@ public class TikaResource {
     // ==================== Internal methods ====================
 
     /**
-     * Produces raw streaming output (text, html, xml) using pipes-based parsing.
+     * Produces raw streaming output (text, html, xml, md) using pipes-based parsing.
      */
     private StreamingOutput produceRawOutput(TikaInputStream tis, Metadata metadata,
                                              MultivaluedMap<String, String> httpHeaders,
diff --git 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 38e51de0c0..a03d8055fa 100644
--- 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.server.standard;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -54,6 +55,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
     private static final String FORM_PATH = "/form";
     private static final String META_PATH = "/rmeta";
     private static final String TEXT_PATH = "/text";
+    private static final String MD_PATH = "/md";
     private static final String IGNORE_PATH = "/ignore";
     private static final String XML_PATH = "/xml";
     private static final String UNPARSEABLE_PATH = "/somethingOrOther";
@@ -327,6 +329,23 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT));
 
+        //markdown
+        response = WebClient
+                .create(endPoint + META_PATH + MD_PATH)
+                .accept("application/json")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList
+                .get(6)
+                .get(TikaCoreProperties.TIKA_CONTENT)
+                .trim();
+        // Markdown output should not contain HTML/XML tags
+        assertFalse(content.startsWith("<html"));
+        // Should contain the document text
+        assertContains("plundered our seas", content);
+
     }
 
     @Test
@@ -419,6 +438,25 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         assertNull(metadataList
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT));
+
+        //markdown
+        attachmentPart =
+                new Attachment("myworddocx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + MD_PATH);
+
+        response = webClient
+                .type("multipart/form-data")
+                .accept("application/json")
+                .post(attachmentPart);
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList
+                .get(6)
+                .get(TikaCoreProperties.TIKA_CONTENT)
+                .trim();
+        assertFalse(content.startsWith("<html"));
+        assertContains("plundered our seas", content);
     }
 
     @Test
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index 0b4fe94fb0..934bafc9e5 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -159,6 +159,20 @@ public class TikaResourceTest extends CXFTestBase {
         assertEquals(UNPROCESSEABLE, response.getStatus());
     }
 
+    @Test
+    public void testSimpleWordMarkdown() throws Exception {
+        Response response = WebClient
+                .create(endPoint + TIKA_PATH + "/md")
+                .type("application/msword")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
+        assertTrue(responseMsg.contains("test"));
+        // Should not contain HTML/XML tags
+        assertFalse(responseMsg.contains("<html"));
+        assertFalse(responseMsg.contains("<body"));
+        assertFalse(responseMsg.contains("<p>"));
+    }
+
     @Test
     public void testSimpleWordHTML() throws Exception {
         Response response = WebClient

(tika) 02/02: TIKA-4653 - add markdown contenthandler

Reply via email to