This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 68d9d336e5 TIKA-4653 - markdown for 3x (#2599)
68d9d336e5 is described below
commit 68d9d336e51c25ab114fd9df5fee76d7688130a0
Author: Tim Allison <[email protected]>
AuthorDate: Mon Feb 9 09:48:23 2026 -0500
TIKA-4653 - markdown for 3x (#2599)
---
CHANGES.txt | 6 +-
.../tika/sax/BasicContentHandlerFactory.java | 14 +-
.../apache/tika/sax/ToMarkdownContentHandler.java | 542 ++++++++++++
.../tika/sax/ToMarkdownContentHandlerTest.java | 941 +++++++++++++++++++++
.../core/resource/RecursiveMetadataResource.java | 6 +-
.../tika/server/core/resource/TikaResource.java | 43 +
.../standard/RecursiveMetadataResourceTest.java | 36 +
.../tika/server/standard/TikaResourceTest.java | 13 +
8 files changed, 1596 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index f890bcfdef..5def269d65 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,11 @@
Release 3.3.0 - ???
* Users need to add "allowAbsolutePaths=true" for the FileSystemFetcher to
fetch
- an absolute path (
+ an absolute path (TIKA-4649).
+
+ * Add a markdown option for content handlers (TIKA-4653).
+
+ * Improve zip parsing (TIKA-4650).
* Add detection of compressed bmp (TIKA-4511).
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 361b7817c7..835f6e91d1 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -77,7 +77,7 @@ public class BasicContentHandlerFactory implements
ContentHandlerFactory, WriteL
* Tries to parse string into handler type. Returns default if string is
null or
* parse fails.
* <p/>
- * Options: xml, html, text, body, ignore (no content)
+ * Options: xml, html, text, body, ignore (no content), markdown/md
*
* @param handlerTypeName string to parse
* @param defaultType type to return if parse fails
@@ -102,6 +102,9 @@ public class BasicContentHandlerFactory implements
ContentHandlerFactory, WriteL
return HANDLER_TYPE.BODY;
case "ignore":
return HANDLER_TYPE.IGNORE;
+ case "markdown":
+ case "md":
+ return HANDLER_TYPE.MARKDOWN;
default:
return defaultType;
}
@@ -133,6 +136,8 @@ public class BasicContentHandlerFactory implements
ContentHandlerFactory, WriteL
return new ToHTMLContentHandler();
case XML:
return new ToXMLContentHandler();
+ case MARKDOWN:
+ return new ToMarkdownContentHandler();
default:
return new ToTextContentHandler();
}
@@ -160,6 +165,9 @@ public class BasicContentHandlerFactory implements
ContentHandlerFactory, WriteL
case XML:
return new WriteOutContentHandler(
new ToXMLContentHandler(os, charset.name()),
writeLimit);
+ case MARKDOWN:
+ return new WriteOutContentHandler(
+ new ToMarkdownContentHandler(os,
charset.name()), writeLimit);
default:
return new WriteOutContentHandler(
new ToTextContentHandler(os, charset.name()),
writeLimit);
@@ -174,6 +182,8 @@ public class BasicContentHandlerFactory implements
ContentHandlerFactory, WriteL
return new ToHTMLContentHandler(os, charset.name());
case XML:
return new ToXMLContentHandler(os, charset.name());
+ case MARKDOWN:
+ return new ToMarkdownContentHandler(os,
charset.name());
default:
return new ToTextContentHandler(os, charset.name());
@@ -196,7 +206,7 @@ public class BasicContentHandlerFactory implements
ContentHandlerFactory, WriteL
*/
public enum HANDLER_TYPE {
BODY, IGNORE, //don't store content
- TEXT, HTML, XML
+ TEXT, HTML, XML, MARKDOWN
}
public int getWriteLimit() {
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
new file mode 100644
index 0000000000..34e5e96cef
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
@@ -0,0 +1,542 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+import java.util.Locale;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX event handler that writes content as Markdown.
+ * Supports headings, paragraphs, bold, italic, links, images, lists (ordered
+ * and unordered, including nested), tables (GFM pipe tables), code blocks,
+ * inline code, blockquotes, horizontal rules, and definition lists.
+ * <p>
+ * Content within {@code <script>} and {@code <style>} tags is ignored.
+ * </p>
+ *
+ * @since Apache Tika 3.3
+ */
+public class ToMarkdownContentHandler extends DefaultHandler {
+
+    /** Sink that receives every piece of generated Markdown. */
+    private final Writer writer;
+
+    // Names of the currently open elements: pushed in startElement, popped in endElement.
+    private final Deque<String> elementStack = new ArrayDeque<>();
+    // One entry per open <ul>/<ol>; the head of the deque is the innermost list.
+    private final Deque<ListState> listStack = new ArrayDeque<>();
+
+    // Link buffering: text between <a> and </a> is collected here and emitted at </a>.
+    private StringBuilder linkText;
+    private String linkHref;
+
+    // Table buffering: only the outermost table (tableDepth == 1) produces rows and cells.
+    private int tableDepth = 0;
+    private List<List<String>> tableRows;
+    private List<String> currentRow;
+    private StringBuilder currentCell;
+
+    // Blockquote nesting level; text at a line start is prefixed with that many '>'.
+    private int blockquoteDepth = 0;
+
+    // Code: text inside <pre> or inline <code> is written without Markdown escaping.
+    private boolean inPreBlock = false;
+    private boolean inInlineCode = false;
+
+    // Script/style suppression: while either counter is positive, content is dropped.
+    private int scriptDepth = 0;
+    private int styleDepth = 0;
+
+    // Spacing: a pending blank line between blocks, and whether the last write ended a line.
+    private boolean needsBlockSeparator = false;
+    private boolean atLineStart = true;
+
+    // Track if we've written any content at all
+    private boolean hasContent = false;
+
+    /**
+     * Creates a handler that writes Markdown to the given writer.
+     *
+     * @param writer destination for the generated Markdown
+     */
+    public ToMarkdownContentHandler(Writer writer) {
+        this.writer = writer;
+    }
+
+    /**
+     * Creates a handler that writes Markdown to the given stream in the named encoding.
+     *
+     * @param stream   destination stream
+     * @param encoding name of the character encoding used to encode the output
+     * @throws UnsupportedEncodingException if the named encoding is not supported
+     */
+    public ToMarkdownContentHandler(OutputStream stream, String encoding)
+            throws UnsupportedEncodingException {
+        this(new OutputStreamWriter(stream, encoding));
+    }
+
+    /**
+     * Creates a handler that collects Markdown in an internal {@link StringWriter};
+     * the collected text is available through {@link #toString()}.
+     */
+    public ToMarkdownContentHandler() {
+        this(new StringWriter());
+    }
+
+    /**
+     * Handles a start tag and emits the Markdown that opens the element.
+     * <p>
+     * Element names are matched case-insensitively on the local name. Everything inside
+     * {@code <script>} or {@code <style>} is suppressed. Link text and the cells of the
+     * outermost table are buffered and emitted when the link or table closes.
+     *
+     * @param uri       namespace URI (not used)
+     * @param localName local element name; {@code qName} is used when this is empty
+     * @param qName     qualified element name
+     * @param atts      element attributes; {@code href}, {@code alt} and {@code src} are read
+     * @throws SAXException if writing to the underlying writer fails
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts)
+            throws SAXException {
+        String name = localName(localName, qName);
+
+        // Every start tag is pushed, suppressed or not, so endElement can pop one per end tag.
+        elementStack.push(name);
+
+        // <script>/<style> open a suppressed region; nothing inside it is rendered.
+        if (name.equals("script")) {
+            scriptDepth++;
+            return;
+        }
+        if (name.equals("style")) {
+            styleDepth++;
+            return;
+        }
+        if (scriptDepth > 0 || styleDepth > 0) {
+            return;
+        }
+
+        // NOTE(review): inline markers below ("**", "*", "`", images) are written straight to
+        // the writer even while link or table-cell text is being buffered, so they are emitted
+        // ahead of the buffered text. Fixing that needs a coordinated change with endElement.
+        switch (name) {
+            case "h1":
+            case "h2":
+            case "h3":
+            case "h4":
+            case "h5":
+            case "h6":
+                emitBlockSeparator();
+                // The digit after 'h' is the heading level, i.e. the number of '#'.
+                int level = name.charAt(1) - '0';
+                write(repeatChar('#', level) + " ");
+                break;
+            case "p":
+            case "div":
+                emitBlockSeparator();
+                break;
+            case "b":
+            case "strong":
+                write("**");
+                break;
+            case "i":
+            case "em":
+                write("*");
+                break;
+            case "a":
+                // Start buffering; the link is emitted as [text](href) in endElement.
+                linkHref = atts.getValue("href");
+                linkText = new StringBuilder();
+                break;
+            case "img":
+                // Missing alt/src attributes render as empty strings: ![](...)
+                String alt = atts.getValue("alt");
+                String src = atts.getValue("src");
+                write("![" + (alt != null ? alt : "") + "](" + (src != null ? src : "") + ")");
+                break;
+            case "ul":
+            case "ol":
+                if (listStack.isEmpty()) {
+                    emitBlockSeparator();
+                } else if (!atLineStart) {
+                    // Nested list: the parent item's text is still on the current line, so
+                    // break the line before the first nested item is written.
+                    write("\n");
+                }
+                // The current stack size is the nesting depth of the new list.
+                listStack.push(new ListState(name.equals("ol"), listStack.size()));
+                break;
+            case "li":
+                if (!listStack.isEmpty()) {
+                    ListState state = listStack.peek();
+                    // Four spaces of indent per nesting level.
+                    String indent = repeatChar(' ', state.depth * 4);
+                    if (state.ordered) {
+                        state.counter++;
+                        write(indent + state.counter + ". ");
+                    } else {
+                        write(indent + "- ");
+                    }
+                }
+                break;
+            case "blockquote":
+                emitBlockSeparator();
+                blockquoteDepth++;
+                break;
+            case "pre":
+                emitBlockSeparator();
+                inPreBlock = true;
+                write("```\n");
+                break;
+            case "code":
+                // <code> inside <pre> is already covered by the fence.
+                if (!inPreBlock) {
+                    inInlineCode = true;
+                    write("`");
+                }
+                break;
+            case "br":
+                write("\n");
+                atLineStart = true;
+                break;
+            case "hr":
+                emitBlockSeparator();
+                write("---");
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "table":
+                tableDepth++;
+                // Only the outermost table starts a new row buffer.
+                if (tableDepth == 1) {
+                    emitBlockSeparator();
+                    tableRows = new ArrayList<>();
+                }
+                break;
+            case "tr":
+                if (tableDepth == 1 && tableRows != null) {
+                    currentRow = new ArrayList<>();
+                }
+                break;
+            case "th":
+            case "td":
+                if (tableDepth == 1 && currentRow != null) {
+                    currentCell = new StringBuilder();
+                }
+                break;
+            case "dt":
+                emitBlockSeparator();
+                write("**");
+                break;
+            case "dd":
+                write("\n: ");
+                break;
+            default:
+                // Ignore structural elements like html, head, body, title, meta
+                break;
+        }
+    }
+
+    /**
+     * Handles an end tag: closes inline markers, flushes buffered links and tables, and
+     * records that a blank line is needed before the next block.
+     * <p>
+     * Depth counters are never decremented below zero, so an unmatched end tag in malformed
+     * input cannot disable script/style suppression or blockquote prefixes for later content.
+     *
+     * @param uri       namespace URI (not used)
+     * @param localName local element name; {@code qName} is used when this is empty
+     * @param qName     qualified element name
+     * @throws SAXException if writing to the underlying writer fails
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        String name = localName(localName, qName);
+
+        if (!elementStack.isEmpty()) {
+            elementStack.pop();
+        }
+
+        // Leave a suppressed region; guard against end tags that were never opened.
+        if (name.equals("script")) {
+            if (scriptDepth > 0) {
+                scriptDepth--;
+            }
+            return;
+        }
+        if (name.equals("style")) {
+            if (styleDepth > 0) {
+                styleDepth--;
+            }
+            return;
+        }
+
+        if (scriptDepth > 0 || styleDepth > 0) {
+            return;
+        }
+
+        switch (name) {
+            case "h1":
+            case "h2":
+            case "h3":
+            case "h4":
+            case "h5":
+            case "h6":
+            case "p":
+            case "dd":
+            case "div":
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "b":
+            case "strong":
+            case "dt":
+                write("**");
+                break;
+            case "i":
+            case "em":
+                write("*");
+                break;
+            case "a":
+                if (linkText != null) {
+                    String text = linkText.toString();
+                    String href = linkHref != null ? linkHref : "";
+                    write("[" + text + "](" + href + ")");
+                    linkText = null;
+                    linkHref = null;
+                }
+                break;
+            case "ul":
+            case "ol":
+                if (!listStack.isEmpty()) {
+                    listStack.pop();
+                }
+                // Only the end of the outermost list ends the block.
+                if (listStack.isEmpty()) {
+                    needsBlockSeparator = true;
+                    hasContent = true;
+                }
+                break;
+            case "li":
+                // Terminate the item's line unless it already ended with a newline (for
+                // example after a nested list or <br>), which would otherwise add a blank line.
+                if (!atLineStart) {
+                    write("\n");
+                }
+                atLineStart = true;
+                break;
+            case "blockquote":
+                if (blockquoteDepth > 0) {
+                    blockquoteDepth--;
+                }
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "pre":
+                // The closing fence must sit on its own line.
+                if (!endsWithNewline()) {
+                    write("\n");
+                }
+                write("```");
+                inPreBlock = false;
+                needsBlockSeparator = true;
+                hasContent = true;
+                break;
+            case "code":
+                if (!inPreBlock) {
+                    inInlineCode = false;
+                    write("`");
+                }
+                break;
+            case "table":
+                // Only the outermost table is rendered; inner tables just unwind the depth.
+                if (tableDepth == 1) {
+                    emitTable();
+                    tableRows = null;
+                    currentRow = null;
+                    currentCell = null;
+                    needsBlockSeparator = true;
+                    hasContent = true;
+                }
+                tableDepth = Math.max(0, tableDepth - 1);
+                break;
+            case "tr":
+                if (tableDepth == 1 && tableRows != null && currentRow != null) {
+                    tableRows.add(currentRow);
+                    currentRow = null;
+                }
+                break;
+            case "th":
+            case "td":
+                if (tableDepth == 1 && currentRow != null && currentCell != null) {
+                    currentRow.add(currentCell.toString().trim());
+                    currentCell = null;
+                }
+                break;
+            default:
+                break;
+        }
+    }
+
+    /**
+     * Handles character data.
+     * <p>
+     * Text inside {@code <script>}/{@code <style>} is dropped. Text inside a link or a cell of
+     * the outermost table is buffered for later emission; it is Markdown-escaped as it is
+     * buffered so that characters such as {@code |} or {@code ]} cannot break the table row or
+     * the link label. Text inside {@code <pre>} or inline {@code <code>} is written verbatim.
+     * All other text is escaped and, inside a blockquote, prefixed with {@code >} markers when
+     * it starts a line.
+     *
+     * @param ch     character buffer
+     * @param start  offset of the first character to read
+     * @param length number of characters to read
+     * @throws SAXException if writing to the underlying writer fails
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (scriptDepth > 0 || styleDepth > 0) {
+            return;
+        }
+
+        String raw = new String(ch, start, length);
+
+        // Buffered targets take precedence: link text first, then the current table cell.
+        if (linkText != null) {
+            linkText.append(escapeMarkdown(raw));
+            return;
+        }
+        if (currentCell != null) {
+            currentCell.append(escapeMarkdown(raw));
+            return;
+        }
+
+        // Code is written raw (no escaping).
+        if (inPreBlock || inInlineCode) {
+            write(raw);
+            return;
+        }
+
+        String text = escapeMarkdown(raw);
+        if (text.isEmpty()) {
+            return;
+        }
+
+        // Add the blockquote prefix only when this text starts a new line.
+        if (blockquoteDepth > 0 && atLineStart) {
+            write(repeatChar('>', blockquoteDepth) + " ");
+            atLineStart = false;
+        }
+
+        write(text);
+        hasContent = true;
+    }
+
+    /**
+     * Treats ignorable whitespace exactly like ordinary character data.
+     *
+     * @param ch     character buffer
+     * @param start  offset of the first character to read
+     * @param length number of characters to read
+     * @throws SAXException if writing to the underlying writer fails
+     */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        this.characters(ch, start, length);
+    }
+
+ @Override
+ public void endDocument() throws SAXException {
+ try {
+ writer.flush();
+ } catch (IOException e) {
+ throw new SAXException("Error flushing character output", e);
+ }
+ }
+
+    /**
+     * Returns the string form of the underlying writer. With the no-argument constructor
+     * that writer is a {@link StringWriter}, so this is the Markdown written so far.
+     *
+     * @return {@code writer.toString()}
+     */
+    @Override
+    public String toString() {
+        return this.writer.toString();
+    }
+
+    /**
+     * Writes a string to the underlying writer and, for a non-empty string, records whether
+     * the output now ends with a newline.
+     *
+     * @param s text to write
+     * @throws SAXException wrapping the {@link IOException} if the write fails
+     */
+    private void write(String s) throws SAXException {
+        try {
+            writer.write(s);
+        } catch (IOException e) {
+            throw new SAXException("Error writing: " + s, e);
+        }
+        // Only reached after a successful write; an empty string leaves the flag untouched.
+        int len = s.length();
+        if (len > 0) {
+            atLineStart = (s.charAt(len - 1) == '\n');
+        }
+    }
+
+    /**
+     * Writes the blank line that separates two blocks, but only when a previous block asked
+     * for one and some content has already been produced.
+     *
+     * @throws SAXException if writing to the underlying writer fails
+     */
+    private void emitBlockSeparator() throws SAXException {
+        if (!needsBlockSeparator || !hasContent) {
+            return;
+        }
+        write("\n\n");
+        needsBlockSeparator = false;
+        atLineStart = true;
+    }
+
+    /**
+     * Renders the buffered rows as a GFM pipe table.
+     * <p>
+     * The first buffered row becomes the header row and is followed by the {@code ---}
+     * delimiter row. Shorter rows are padded with empty cells up to the widest row. Nothing is
+     * written when there are no rows or when no row contains a cell. Line breaks inside a cell
+     * are replaced by spaces because a pipe-table row must stay on a single line.
+     *
+     * @throws SAXException if writing to the underlying writer fails
+     */
+    private void emitTable() throws SAXException {
+        if (tableRows == null || tableRows.isEmpty()) {
+            return;
+        }
+
+        // The widest row determines the column count.
+        int cols = 0;
+        for (List<String> row : tableRows) {
+            cols = Math.max(cols, row.size());
+        }
+        if (cols == 0) {
+            // Rows without any cells would otherwise render as bare "|" lines.
+            return;
+        }
+
+        boolean headerWritten = false;
+        for (List<String> row : tableRows) {
+            StringBuilder line = new StringBuilder("|");
+            for (int c = 0; c < cols; c++) {
+                String cell = c < row.size() ? row.get(c) : "";
+                cell = cell.replace("\r\n", " ").replace('\n', ' ').replace('\r', ' ');
+                line.append(" ").append(cell).append(" |");
+            }
+            write(line.toString());
+            write("\n");
+
+            // Insert the delimiter row directly after the first (header) row.
+            if (!headerWritten) {
+                headerWritten = true;
+                StringBuilder sep = new StringBuilder("|");
+                for (int c = 0; c < cols; c++) {
+                    sep.append(" --- |");
+                }
+                write(sep.toString());
+                write("\n");
+            }
+        }
+    }
+
+    /**
+     * Reports whether the most recently written character was a newline.
+     * <p>
+     * Uses {@code atLineStart}, which {@link #write(String)} refreshes after every non-empty
+     * write, instead of inspecting {@code writer.toString()}: that call only reflects the
+     * output for writers such as {@link StringWriter} (an {@link OutputStreamWriter} does not
+     * expose its content that way) and copies the whole buffer on every call. Before anything
+     * has been written the flag is {@code true}.
+     *
+     * @return {@code true} if the output currently ends at a line boundary
+     */
+    private boolean endsWithNewline() {
+        return atLineStart;
+    }
+
+ private static String escapeMarkdown(String text) {
+ StringBuilder sb = new StringBuilder(text.length());
+ for (int i = 0; i < text.length(); i++) {
+ char c = text.charAt(i);
+ switch (c) {
+ case '\\':
+ case '`':
+ case '*':
+ case '_':
+ case '[':
+ case ']':
+ case '#':
+ case '|':
+ sb.append('\\').append(c);
+ break;
+ default:
+ sb.append(c);
+ break;
+ }
+ }
+ return sb.toString();
+ }
+
+    /**
+     * Builds a string consisting of {@code count} copies of {@code c}.
+     *
+     * @param c     character to repeat
+     * @param count number of repetitions; zero yields the empty string
+     * @return the repeated character sequence
+     */
+    private static String repeatChar(char c, int count) {
+        char[] filled = new char[count];
+        java.util.Arrays.fill(filled, c);
+        return new String(filled);
+    }
+
+    /**
+     * Resolves the lower-cased element name used for dispatch.
+     * <p>
+     * A non-empty {@code localName} wins. Otherwise the part of {@code qName} after the first
+     * colon (or all of it when there is no colon) is used. With neither available the result
+     * is the empty string.
+     *
+     * @param localName SAX local name, may be null or empty
+     * @param qName     SAX qualified name, may be null
+     * @return lower-cased (ROOT locale) element name, never null
+     */
+    private static String localName(String localName, String qName) {
+        boolean hasLocal = localName != null && !localName.isEmpty();
+        if (hasLocal) {
+            return localName.toLowerCase(Locale.ROOT);
+        }
+        if (qName == null) {
+            return "";
+        }
+        // Strip namespace prefix
+        String unprefixed = qName.substring(qName.indexOf(':') + 1);
+        return unprefixed.toLowerCase(Locale.ROOT);
+    }
+
+    /** Bookkeeping for one open list: its kind, nesting depth and running item number. */
+    private static class ListState {
+        /** True for {@code <ol>}, false for {@code <ul>}. */
+        final boolean ordered;
+        /** Nesting depth; the outermost list has depth 0. */
+        final int depth;
+        /** Number of the most recently started item; starts at 0. */
+        int counter;
+
+        ListState(boolean ordered, int depth) {
+            this.depth = depth;
+            this.ordered = ordered;
+        }
+    }
+}
diff --git
a/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
new file mode 100644
index 0000000000..822962a96c
--- /dev/null
+++
b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
@@ -0,0 +1,941 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Random;
+
+import org.junit.jupiter.api.RepeatedTest;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Test cases for the {@link ToMarkdownContentHandler} class.
+ */
+public class ToMarkdownContentHandlerTest {
+
+ private static final String XHTML = "http://www.w3.org/1999/xhtml";
+ private static final Attributes EMPTY = new AttributesImpl();
+
+    /** Fires a start-element event in the XHTML namespace without attributes. */
+    private static void startElement(ContentHandler handler, String name) throws Exception {
+        handler.startElement(XHTML, name, name, EMPTY);
+    }
+
+    /** Fires a start-element event in the XHTML namespace with a single CDATA attribute. */
+    private static void startElement(ContentHandler handler, String name, String attrName,
+                                     String attrValue) throws Exception {
+        AttributesImpl single = new AttributesImpl();
+        single.addAttribute("", attrName, attrName, "CDATA", attrValue);
+        startElement(handler, name, single);
+    }
+
+    /** Fires a start-element event in the XHTML namespace with the given attributes. */
+    private static void startElement(ContentHandler handler, String name, AttributesImpl atts)
+            throws Exception {
+        handler.startElement(XHTML, name, name, atts);
+    }
+
+    /** Fires an end-element event in the XHTML namespace. */
+    private static void endElement(ContentHandler target, String tag) throws Exception {
+        target.endElement(XHTML, tag, tag);
+    }
+
+    /** Sends the whole string to the handler as one characters event. */
+    private static void chars(ContentHandler handler, String text) throws Exception {
+        handler.characters(text.toCharArray(), 0, text.length());
+    }
+
+ @Test
+ public void testHeadings() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "h1");
+ chars(handler, "Title");
+ endElement(handler, "h1");
+
+ startElement(handler, "h2");
+ chars(handler, "Subtitle");
+ endElement(handler, "h2");
+
+ startElement(handler, "h3");
+ chars(handler, "Section");
+ endElement(handler, "h3");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("# Title"));
+ assertTrue(result.contains("## Subtitle"));
+ assertTrue(result.contains("### Section"));
+ }
+
+ @Test
+ public void testAllHeadingLevels() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ for (int i = 1; i <= 6; i++) {
+ startElement(handler, "h" + i);
+ chars(handler, "H" + i);
+ endElement(handler, "h" + i);
+ }
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("# H1"));
+ assertTrue(result.contains("## H2"));
+ assertTrue(result.contains("### H3"));
+ assertTrue(result.contains("#### H4"));
+ assertTrue(result.contains("##### H5"));
+ assertTrue(result.contains("###### H6"));
+ }
+
+    /** Consecutive paragraphs are separated by exactly one blank line. */
+    @Test
+    public void testParagraphs() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        for (String body : new String[] {"First paragraph.", "Second paragraph."}) {
+            startElement(md, "p");
+            chars(md, body);
+            endElement(md, "p");
+        }
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("First paragraph."));
+        assertTrue(out.contains("Second paragraph."));
+        // Paragraphs should be separated by blank line
+        assertTrue(out.contains("First paragraph.\n\nSecond paragraph."));
+    }
+
+    /** {@code <b>} content is wrapped in double asterisks. */
+    @Test
+    public void testBold() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        chars(md, "This is ");
+        startElement(md, "b");
+        chars(md, "bold");
+        endElement(md, "b");
+        chars(md, " text.");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        assertTrue(md.toString().contains("**bold**"));
+    }
+
+    /** {@code <strong>} content is wrapped in double asterisks. */
+    @Test
+    public void testStrong() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        startElement(md, "strong");
+        chars(md, "strong");
+        endElement(md, "strong");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("**strong**"));
+    }
+
+    /** {@code <i>} content is wrapped in single asterisks. */
+    @Test
+    public void testItalic() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        chars(md, "This is ");
+        startElement(md, "i");
+        chars(md, "italic");
+        endElement(md, "i");
+        chars(md, " text.");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("*italic*"));
+    }
+
+    /** {@code <em>} content is wrapped in single asterisks. */
+    @Test
+    public void testEmphasis() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        startElement(md, "em");
+        chars(md, "emphasized");
+        endElement(md, "em");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("*emphasized*"));
+    }
+
+    /** An anchor with an href renders as an inline Markdown link. */
+    @Test
+    public void testLink() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        chars(md, "Click ");
+        startElement(md, "a", "href", "https://example.com");
+        chars(md, "here");
+        endElement(md, "a");
+        chars(md, " for more.");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("[here](https://example.com)"));
+    }
+
+    /** An image renders as Markdown image syntax built from its alt and src attributes. */
+    @Test
+    public void testImage() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        AttributesImpl atts = new AttributesImpl();
+        atts.addAttribute("", "alt", "alt", "CDATA", "A photo");
+        atts.addAttribute("", "src", "src", "CDATA", "photo.jpg");
+        startElement(handler, "img", atts);
+        endElement(handler, "img");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        // An empty expected string would match any output, so pin the rendered image.
+        assertTrue(handler.toString().contains("![A photo](photo.jpg)"));
+    }
+
+ @Test
+ public void testUnorderedList() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "ul");
+ startElement(handler, "li");
+ chars(handler, "Apple");
+ endElement(handler, "li");
+ startElement(handler, "li");
+ chars(handler, "Banana");
+ endElement(handler, "li");
+ startElement(handler, "li");
+ chars(handler, "Cherry");
+ endElement(handler, "li");
+ endElement(handler, "ul");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("- Apple"));
+ assertTrue(result.contains("- Banana"));
+ assertTrue(result.contains("- Cherry"));
+ }
+
+ @Test
+ public void testOrderedList() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "ol");
+ startElement(handler, "li");
+ chars(handler, "First");
+ endElement(handler, "li");
+ startElement(handler, "li");
+ chars(handler, "Second");
+ endElement(handler, "li");
+ startElement(handler, "li");
+ chars(handler, "Third");
+ endElement(handler, "li");
+ endElement(handler, "ol");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("1. First"));
+ assertTrue(result.contains("2. Second"));
+ assertTrue(result.contains("3. Third"));
+ }
+
+    /** A list nested inside a list item renders its own items below the parent item. */
+    @Test
+    public void testNestedLists() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "ul");
+
+        // First outer item, carrying the nested list
+        startElement(md, "li");
+        chars(md, "Fruit");
+        startElement(md, "ul");
+        for (String inner : new String[] {"Apple", "Banana"}) {
+            startElement(md, "li");
+            chars(md, inner);
+            endElement(md, "li");
+        }
+        endElement(md, "ul");
+        endElement(md, "li");
+
+        // Second outer item
+        startElement(md, "li");
+        chars(md, "Vegetable");
+        endElement(md, "li");
+
+        endElement(md, "ul");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("- Fruit"));
+        assertTrue(out.contains(" - Apple"));
+        assertTrue(out.contains(" - Banana"));
+        assertTrue(out.contains("- Vegetable"));
+    }
+
+ @Test
+ public void testTable() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "table");
+
+ // Header row
+ startElement(handler, "tr");
+ startElement(handler, "th");
+ chars(handler, "Name");
+ endElement(handler, "th");
+ startElement(handler, "th");
+ chars(handler, "Age");
+ endElement(handler, "th");
+ endElement(handler, "tr");
+
+ // Data row
+ startElement(handler, "tr");
+ startElement(handler, "td");
+ chars(handler, "Alice");
+ endElement(handler, "td");
+ startElement(handler, "td");
+ chars(handler, "30");
+ endElement(handler, "td");
+ endElement(handler, "tr");
+
+ endElement(handler, "table");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("| Name | Age |"));
+ assertTrue(result.contains("| --- | --- |"));
+ assertTrue(result.contains("| Alice | 30 |"));
+ }
+
+    /** {@code <pre><code>} renders as a fenced code block. */
+    @Test
+    public void testFencedCodeBlock() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "pre");
+        startElement(md, "code");
+        chars(md, "int x = 42;");
+        endElement(md, "code");
+        endElement(md, "pre");
+
+        md.endDocument();
+
+        String out = md.toString();
+        for (String expected : new String[] {"```\n", "int x = 42;", "\n```"}) {
+            assertTrue(out.contains(expected));
+        }
+    }
+
+    /** {@code <code>} outside of {@code <pre>} is wrapped in backticks. */
+    @Test
+    public void testInlineCode() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        chars(md, "Use the ");
+        startElement(md, "code");
+        chars(md, "println");
+        endElement(md, "code");
+        chars(md, " function.");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("`println`"));
+    }
+
+    /** Blockquote text is prefixed with "> ". */
+    @Test
+    public void testBlockquote() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "blockquote");
+        chars(md, "To be or not to be.");
+        endElement(md, "blockquote");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("> To be or not to be."));
+    }
+
+    /** {@code <hr>} between two paragraphs renders as "---". */
+    @Test
+    public void testHorizontalRule() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        chars(md, "Above");
+        endElement(md, "p");
+
+        startElement(md, "hr");
+        endElement(md, "hr");
+
+        startElement(md, "p");
+        chars(md, "Below");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        for (String expected : new String[] {"---", "Above", "Below"}) {
+            assertTrue(out.contains(expected));
+        }
+    }
+
+    /** {@code <br>} renders as a single newline inside the paragraph. */
+    @Test
+    public void testLineBreak() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        chars(md, "Line one");
+        startElement(md, "br");
+        endElement(md, "br");
+        chars(md, "Line two");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("Line one\nLine two"));
+    }
+
+    /** Bold markup inside a list item follows the bullet on the same line. */
+    @Test
+    public void testBoldInsideListItem() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "ul");
+        startElement(md, "li");
+        startElement(md, "b");
+        chars(md, "Important");
+        endElement(md, "b");
+        chars(md, " item");
+        endElement(md, "li");
+        endElement(md, "ul");
+
+        md.endDocument();
+
+        String out = md.toString();
+        assertTrue(out.contains("- **Important** item"));
+    }
+
+    /** A link inside a heading renders after the heading marker on the same line. */
+    @Test
+    public void testLinkInsideHeading() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "h2");
+        startElement(handler, "a", "href", "https://example.com");
+        chars(handler, "Linked Title");
+        endElement(handler, "a");
+        endElement(handler, "h2");
+
+        handler.endDocument();
+
+        // The expected text is a single line; a string literal cannot span source lines.
+        assertTrue(handler.toString().contains("## [Linked Title](https://example.com)"));
+    }
+
+ @Test
+ public void testScriptContentSkipped() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "p");
+ chars(handler, "Before");
+ endElement(handler, "p");
+
+ startElement(handler, "script");
+ chars(handler, "alert('xss');");
+ endElement(handler, "script");
+
+ startElement(handler, "p");
+ chars(handler, "After");
+ endElement(handler, "p");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("Before"));
+ assertTrue(result.contains("After"));
+ assertFalse(result.contains("alert"));
+ }
+
+ @Test
+ public void testStyleContentSkipped() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "p");
+ chars(handler, "Visible");
+ endElement(handler, "p");
+
+ startElement(handler, "style");
+ chars(handler, "body { color: red; }");
+ endElement(handler, "style");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("Visible"));
+ assertFalse(result.contains("color"));
+ }
+
+ @Test
+ public void testMarkdownEscaping() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "p");
+ chars(handler, "Special chars: * _ [ ] # | \\ `");
+ endElement(handler, "p");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("\\*"));
+ assertTrue(result.contains("\\_"));
+ assertTrue(result.contains("\\["));
+ assertTrue(result.contains("\\]"));
+ assertTrue(result.contains("\\#"));
+ assertTrue(result.contains("\\|"));
+ assertTrue(result.contains("\\\\"));
+ assertTrue(result.contains("\\`"));
+ }
+
+    /** Text inside a fenced code block is written verbatim. */
+    @Test
+    public void testNoEscapingInCodeBlock() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "pre");
+        startElement(md, "code");
+        chars(md, "x * y = z");
+        endElement(md, "code");
+        endElement(md, "pre");
+
+        md.endDocument();
+
+        String out = md.toString();
+        // Inside code blocks, * should NOT be escaped
+        assertTrue(out.contains("x * y = z"));
+        assertFalse(out.contains("\\*"));
+    }
+
+    /**
+     * A {@code code} element inside a paragraph is rendered as a backtick span
+     * whose content is not escaped.
+     */
+    @Test
+    public void testNoEscapingInInlineCode() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "p");
+        startElement(md, "code");
+        chars(md, "a*b");
+        endElement(md, "code");
+        endElement(md, "p");
+
+        md.endDocument();
+
+        String markdown = md.toString();
+        assertTrue(markdown.contains("`a*b`"));
+    }
+
+    /**
+     * A definition list renders the term in bold and the definition prefixed
+     * with a colon and a space.
+     */
+    @Test
+    public void testDefinitionList() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "dl");
+
+        // term
+        startElement(md, "dt");
+        chars(md, "Term");
+        endElement(md, "dt");
+
+        // definition
+        startElement(md, "dd");
+        chars(md, "Definition of the term");
+        endElement(md, "dd");
+
+        endElement(md, "dl");
+
+        md.endDocument();
+
+        String markdown = md.toString();
+        assertTrue(markdown.contains("**Term**"));
+        assertTrue(markdown.contains(": Definition of the term"));
+    }
+
+ @Test
+ public void testDiv() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "div");
+ chars(handler, "Content in div");
+ endElement(handler, "div");
+
+ startElement(handler, "div");
+ chars(handler, "Another div");
+ endElement(handler, "div");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("Content in div"));
+ assertTrue(result.contains("Another div"));
+ // Divs should be separated
+ assertTrue(result.contains("Content in div\n\nAnother div"));
+ }
+
+ @Test
+ public void testHandlerTypeParsingMarkdown() {
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+ BasicContentHandlerFactory.parseHandlerType("markdown",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+ BasicContentHandlerFactory.parseHandlerType("md",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+ BasicContentHandlerFactory.parseHandlerType("MARKDOWN",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
+ BasicContentHandlerFactory.parseHandlerType("MD",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT));
+ }
+
+    /**
+     * A factory configured with the MARKDOWN handler type (write limit -1) must
+     * hand out {@link ToMarkdownContentHandler} instances.
+     */
+    @Test
+    public void testFactoryCreatesMarkdownHandler() {
+        BasicContentHandlerFactory.HANDLER_TYPE markdownType =
+                BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
+        org.xml.sax.ContentHandler created =
+                new BasicContentHandlerFactory(markdownType, -1).getNewContentHandler();
+        assertTrue(created instanceof ToMarkdownContentHandler);
+    }
+
+ @Test
+ public void testTableWithOnlyTd() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "table");
+
+ startElement(handler, "tr");
+ startElement(handler, "td");
+ chars(handler, "A");
+ endElement(handler, "td");
+ startElement(handler, "td");
+ chars(handler, "B");
+ endElement(handler, "td");
+ endElement(handler, "tr");
+
+ startElement(handler, "tr");
+ startElement(handler, "td");
+ chars(handler, "C");
+ endElement(handler, "td");
+ startElement(handler, "td");
+ chars(handler, "D");
+ endElement(handler, "td");
+ endElement(handler, "tr");
+
+ endElement(handler, "table");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ assertTrue(result.contains("| A | B |"));
+ assertTrue(result.contains("| --- | --- |"));
+ assertTrue(result.contains("| C | D |"));
+ }
+
+    /**
+     * A table nested inside a cell of another table is not rendered as a table
+     * of its own; its text ends up in the enclosing outer cell.
+     */
+    @Test
+    public void testNestedTablesIgnored() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        md.startDocument();
+
+        startElement(md, "table");
+
+        // Header row of the outer table.
+        startElement(md, "tr");
+        startElement(md, "th");
+        chars(md, "Outer1");
+        endElement(md, "th");
+        startElement(md, "th");
+        chars(md, "Outer2");
+        endElement(md, "th");
+        endElement(md, "tr");
+
+        // Data row of the outer table; the second cell hosts the inner table.
+        startElement(md, "tr");
+        startElement(md, "td");
+        chars(md, "A");
+        endElement(md, "td");
+        startElement(md, "td");
+        chars(md, "B");
+
+        // Inner table: its structure is expected to be ignored.
+        startElement(md, "table");
+        startElement(md, "tr");
+        startElement(md, "td");
+        chars(md, "Inner");
+        endElement(md, "td");
+        endElement(md, "tr");
+        endElement(md, "table");
+
+        endElement(md, "td");
+        endElement(md, "tr");
+
+        endElement(md, "table");
+
+        md.endDocument();
+
+        String markdown = md.toString();
+        // The outer table is rendered with its header and separator rows.
+        assertTrue(markdown.contains("| Outer1 | Outer2 |"));
+        assertTrue(markdown.contains("| --- | --- |"));
+        // Inner cell text is folded into the outer cell: "B" + "Inner" = "BInner".
+        assertTrue(markdown.contains("| A | BInner |"));
+        // No separate row is produced for the inner table.
+        assertFalse(markdown.contains("| Inner |"));
+    }
+
+ // Pool of element names from which the randomized test in this class draws
+ // its start/end events. Grouped by row: headings, block/inline containers,
+ // emphasis, links/images, lists, tables, quote/code, breaks, definition
+ // lists, script/style, and document-structure elements.
+ private static final String[] ALL_ELEMENTS = {
+ "h1", "h2", "h3", "h4", "h5", "h6",
+ "p", "div", "span",
+ "b", "strong", "i", "em",
+ "a", "img",
+ "ul", "ol", "li",
+ "table", "tr", "th", "td",
+ "blockquote", "pre", "code",
+ "br", "hr",
+ "dl", "dt", "dd",
+ "script", "style",
+ "html", "head", "body", "title", "meta"
+ };
+
+    /**
+     * Randomized test: fire random sequences of startElement/endElement/characters
+     * events with no guarantee of proper nesting. The handler must not throw any
+     * runtime exceptions (e.g., EmptyStackException, NullPointerException,
+     * IndexOutOfBoundsException).
+     * <p>
+     * Every repetition draws its own seed and includes it in the assertion
+     * failure message, so a failing event sequence can be replayed by
+     * constructing the generator with that seed.
+     */
+    @RepeatedTest(20)
+    public void testRandomUnbalancedTags() throws Exception {
+        // Choose the seed explicitly; an unseeded Random makes failures unreproducible.
+        final long seed = new Random().nextLong();
+        final String seedMessage = "random event sequence failed, seed=" + seed;
+        Random rng = new Random(seed);
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            // Between 50 and 199 events per run.
+            int numEvents = 50 + rng.nextInt(150);
+            for (int i = 0; i < numEvents; i++) {
+                int action = rng.nextInt(4);
+                String elem = ALL_ELEMENTS[rng.nextInt(ALL_ELEMENTS.length)];
+                switch (action) {
+                    case 0:
+                        // start element (possibly with attributes)
+                        if (elem.equals("a")) {
+                            startElement(handler, elem, "href", "http://example.com");
+                        } else if (elem.equals("img")) {
+                            AttributesImpl atts = new AttributesImpl();
+                            atts.addAttribute("", "src", "src", "CDATA", "img.png");
+                            atts.addAttribute("", "alt", "alt", "CDATA", "alt text");
+                            startElement(handler, elem, atts);
+                        } else {
+                            startElement(handler, elem);
+                        }
+                        break;
+                    case 1:
+                        // end element (possibly unmatched)
+                        endElement(handler, elem);
+                        break;
+                    case 2:
+                        // characters
+                        chars(handler, "text_" + i);
+                        break;
+                    case 3:
+                        // ignorable whitespace
+                        char[] ws = " \t\n".toCharArray();
+                        handler.ignorableWhitespace(ws, 0, ws.length);
+                        break;
+                }
+            }
+
+            handler.endDocument();
+        }, seedMessage);
+
+        // Just verify we can get the output without error
+        assertDoesNotThrow(() -> handler.toString(), seedMessage);
+    }
+
+ /**
+ * Test extra endElement calls with no matching start -- should not throw.
+ */
+ @Test
+ public void testExtraEndElements() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+ assertDoesNotThrow(() -> {
+ handler.startDocument();
+
+ // End elements with no matching starts
+ endElement(handler, "p");
+ endElement(handler, "table");
+ endElement(handler, "tr");
+ endElement(handler, "td");
+ endElement(handler, "ul");
+ endElement(handler, "li");
+ endElement(handler, "a");
+ endElement(handler, "pre");
+ endElement(handler, "code");
+ endElement(handler, "blockquote");
+ endElement(handler, "b");
+ endElement(handler, "i");
+ endElement(handler, "script");
+ endElement(handler, "style");
+
+ handler.endDocument();
+ });
+ }
+
+ /**
+ * Test start elements with no matching end -- should not throw.
+ * <p>
+ * Opens p, b, a, ul, li, table, tr, td, blockquote and pre in that order,
+ * interleaved with text, and then ends the document without closing any of
+ * them.
+ */
+ @Test
+ public void testUnclosedElements() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+ assertDoesNotThrow(() -> {
+ handler.startDocument();
+
+ // Every element started below is intentionally left open.
+ startElement(handler, "p");
+ chars(handler, "unclosed paragraph");
+ startElement(handler, "b");
+ chars(handler, "unclosed bold");
+ startElement(handler, "a", "href", "http://example.com");
+ chars(handler, "unclosed link");
+ startElement(handler, "ul");
+ startElement(handler, "li");
+ chars(handler, "unclosed list item");
+ startElement(handler, "table");
+ startElement(handler, "tr");
+ startElement(handler, "td");
+ chars(handler, "unclosed cell");
+ startElement(handler, "blockquote");
+ chars(handler, "unclosed quote");
+ startElement(handler, "pre");
+ chars(handler, "unclosed pre");
+
+ // endDocument() is reached with all of the above still open.
+ handler.endDocument();
+ });
+ }
+
+    /**
+     * Deep nesting of one element type must not make the handler throw:
+     * 50 levels of list/list-item pairs followed by 20 levels of blockquote.
+     */
+    @Test
+    public void testDeeplyNestedSameElement() throws Exception {
+        ToMarkdownContentHandler md = new ToMarkdownContentHandler();
+        final int listDepth = 50;
+        final int quoteDepth = 20;
+
+        assertDoesNotThrow(() -> {
+            md.startDocument();
+
+            // Open the nested lists, one item with text per level ...
+            for (int level = 0; level < listDepth; level++) {
+                startElement(md, "ul");
+                startElement(md, "li");
+                chars(md, "level " + level);
+            }
+            // ... then unwind them again.
+            for (int level = 0; level < listDepth; level++) {
+                endElement(md, "li");
+                endElement(md, "ul");
+            }
+
+            // Nested blockquotes around a single piece of text.
+            for (int depth = 0; depth < quoteDepth; depth++) {
+                startElement(md, "blockquote");
+            }
+            chars(md, "deep quote");
+            for (int depth = 0; depth < quoteDepth; depth++) {
+                endElement(md, "blockquote");
+            }
+
+            md.endDocument();
+        });
+    }
+
+ /**
+ * Test interleaved (improperly nested) elements -- should not throw.
+ * <p>
+ * In each of the three cases the outer element is closed before the inner
+ * one, so the end events arrive in the wrong order.
+ */
+ @Test
+ public void testInterleavedElements() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+ assertDoesNotThrow(() -> {
+ handler.startDocument();
+
+ // <b><i>text</b></i> -- improper nesting
+ startElement(handler, "b");
+ startElement(handler, "i");
+ chars(handler, "interleaved");
+ endElement(handler, "b");
+ endElement(handler, "i");
+
+ // <table><p>text</table></p>
+ // (the table is closed while the paragraph inside it is still open)
+ startElement(handler, "table");
+ startElement(handler, "p");
+ chars(handler, "table with p");
+ endElement(handler, "table");
+ endElement(handler, "p");
+
+ // <ul><h1>text</ul></h1>
+ // (the list is closed while the heading inside it is still open)
+ startElement(handler, "ul");
+ startElement(handler, "h1");
+ chars(handler, "list with heading");
+ endElement(handler, "ul");
+ endElement(handler, "h1");
+
+ handler.endDocument();
+ });
+ }
+}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 7f2d2df40f..6152bd0108 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -114,11 +114,12 @@ public class RecursiveMetadataResource {
* The extracted text content is stored with the key
* {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}.
* <p>
- * Specify the handler for the content (xml, html, text, ignore)
+ * Specify the handler for the content (xml, html, text, markdown/md, ignore)
* in the path:<br/>
* /rmeta/form (default: xml)<br/>
* /rmeta/form/xml (store the content as xml)<br/>
* /rmeta/form/text (store the content as text)<br/>
+ * /rmeta/form/md (store the content as markdown)<br/>
* /rmeta/form/ignore (don't record any content)<br/>
*
* @param att attachment
@@ -149,11 +150,12 @@ public class RecursiveMetadataResource {
* The extracted text content is stored with the key
* {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}.
* <p>
- * Specify the handler for the content (xml, html, text, ignore)
+ * Specify the handler for the content (xml, html, text, markdown/md, ignore)
* in the path:<br/>
* /rmeta (default: xml)<br/>
* /rmeta/xml (store the content as xml)<br/>
* /rmeta/text (store the content as text)<br/>
+ * /rmeta/md (store the content as markdown)<br/>
* /rmeta/ignore (don't record any content)<br/>
*
* @param info uri info
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index d25bfce3b6..26481575ef 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -78,6 +78,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RichTextContentHandler;
+import org.apache.tika.sax.ToMarkdownContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.server.core.CompositeParseContextConfig;
import org.apache.tika.server.core.InputStreamFactory;
@@ -511,6 +512,48 @@ public class TikaResource {
};
}
+ /**
+ * PUT endpoint at the relative path {@code md}: accepts a request body of
+ * any content type and answers with {@code text/plain}.
+ * <p>
+ * Creates a fresh {@link Metadata}, resolves the stream to parse through
+ * {@code getInputStream} and delegates to
+ * {@link #produceMarkdown(InputStream, Metadata, MultivaluedMap, UriInfo)}.
+ *
+ * @param is request body
+ * @param httpHeaders headers of the current request
+ * @param info URI information of the current request
+ * @return the streaming output returned by {@code produceMarkdown}
+ */
+ @PUT
+ @Consumes("*/*")
+ @Produces("text/plain")
+ @Path("md")
+ public StreamingOutput getMarkdown(final InputStream is, @Context
HttpHeaders httpHeaders,
+ @Context final UriInfo info) {
+ final Metadata metadata = new Metadata();
+ return produceMarkdown(getInputStream(is, metadata, httpHeaders,
info), metadata,
+ httpHeaders.getRequestHeaders(), info);
+ }
+
+ /**
+ * POST endpoint at the relative path {@code form/md}: accepts
+ * {@code multipart/form-data} and answers with {@code text/plain}.
+ * <p>
+ * Reads the attachment as an {@link InputStream}, builds the header map with
+ * {@code preparePostHeaderMap} and delegates to
+ * {@link #produceMarkdown(InputStream, Metadata, MultivaluedMap, UriInfo)}
+ * with a fresh {@link Metadata}.
+ *
+ * @param att the uploaded attachment
+ * @param httpHeaders headers of the current request
+ * @param info URI information of the current request
+ * @return the streaming output returned by {@code produceMarkdown}
+ */
+ @POST
+ @Consumes("multipart/form-data")
+ @Produces("text/plain")
+ @Path("form/md")
+ public StreamingOutput getMarkdownFromMultipart(Attachment att,
+ @Context HttpHeaders
httpHeaders,
+ @Context final UriInfo
info) {
+ return produceMarkdown(att.getObject(InputStream.class), new
Metadata(),
+ preparePostHeaderMap(att, httpHeaders), info);
+ }
+
+ /**
+ * Prepares a parse of {@code is} whose SAX events are written through a
+ * {@link ToMarkdownContentHandler}.
+ * <p>
+ * The parser, parse context and metadata are set up and the request is
+ * logged eagerly; the parse itself runs only when the returned
+ * {@link StreamingOutput} is asked to write.
+ *
+ * @param is stream to parse
+ * @param metadata metadata object, filled from the headers and passed to the parse
+ * @param httpHeaders request headers used to fill the metadata and parse context
+ * @param info URI information; its path is passed to {@code parse}
+ * @return a streaming output that parses the stream into a UTF-8 writer
+ */
+ public StreamingOutput produceMarkdown(final InputStream is, final
Metadata metadata,
+ MultivaluedMap<String, String>
httpHeaders,
+ final UriInfo info) {
+ final Parser parser = createParser();
+ final ParseContext context = new ParseContext();
+
+ fillMetadata(parser, metadata, httpHeaders);
+ fillParseContext(httpHeaders, metadata, context);
+
+ // Logged under "/tika" even though the endpoint paths end in md.
+ logRequest(LOG, "/tika", metadata);
+
+ return outputStream -> {
+ Writer writer = new OutputStreamWriter(outputStream, UTF_8);
+
+ ContentHandler handler = new ToMarkdownContentHandler(writer);
+
+ // NOTE(review): the writer is neither flushed nor closed in this lambda;
+ // presumably the handler flushes it at endDocument() -- confirm.
+ parse(parser, LOG, info.getPath(), is, handler, metadata, context);
+ };
+ }
+
@POST
@Consumes("multipart/form-data")
@Produces("text/html")
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 78a8de25ba..103187e50d 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.server.standard;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -56,6 +57,7 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
private static final String TEXT_PATH = "/text";
private static final String IGNORE_PATH = "/ignore";
private static final String XML_PATH = "/xml";
+ private static final String MD_PATH = "/md";
private static final String UNPARSEABLE_PATH = "/somethingOrOther";
private static final String SLASH = "/";
@@ -293,6 +295,21 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.trim();
assertTrue(content.startsWith("embed_3"));
+ //markdown
+ response = WebClient
+ .create(endPoint + META_PATH + MD_PATH)
+ .accept("application/json")
+
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ reader = new InputStreamReader((InputStream) response.getEntity(),
UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList
+ .get(6)
+ .get(TikaCoreProperties.TIKA_CONTENT)
+ .trim();
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
+
//ignore
response = WebClient
.create(endPoint + META_PATH + IGNORE_PATH)
@@ -381,6 +398,25 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.trim();
assertTrue(content.startsWith("embed_3"));
+ //markdown
+ attachmentPart =
+ new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ webClient = WebClient.create(endPoint + META_PATH + FORM_PATH +
MD_PATH);
+
+ response = webClient
+ .type("multipart/form-data")
+ .accept("application/json")
+ .post(attachmentPart);
+ reader = new InputStreamReader((InputStream) response.getEntity(),
UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList
+ .get(6)
+ .get(TikaCoreProperties.TIKA_CONTENT)
+ .trim();
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
+
//ignore -- no content
attachmentPart =
new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index 6d20c1f6cd..0834402cdc 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -198,6 +198,19 @@ public class TikaResourceTest extends CXFTestBase {
assertEquals(UNPROCESSEABLE, response.getStatus());
}
+    /**
+     * PUTs the test Word document to the markdown endpoint and checks that the
+     * response contains the document text but no HTML markup.
+     */
+    @Test
+    public void testSimpleWordMarkdown() throws Exception {
+        InputStream wordDoc = ClassLoader.getSystemResourceAsStream(TEST_DOC);
+        Response response = WebClient
+                .create(endPoint + TIKA_PATH + "/md")
+                .type("application/msword")
+                .accept("text/plain")
+                .put(wordDoc);
+        InputStream entity = (InputStream) response.getEntity();
+        String markdown = getStringFromInputStream(entity);
+        assertTrue(markdown.contains("test"));
+        // Neither paragraph tags nor an html root element may leak into the output.
+        assertFalse(markdown.contains("<p"));
+        assertFalse(markdown.contains("<html"));
+    }
+
@Test
public void testSimpleWordHTML() throws Exception {
Response response = WebClient