This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4653-markdown-handler in repository https://gitbox.apache.org/repos/asf/tika.git
commit a637abb926bbe39c84cf17d4969b1ba675844959 Author: tallison <[email protected]> AuthorDate: Mon Feb 9 06:28:55 2026 -0500 TIKA-4653 - add markdown contenthandler --- .../apache/tika/sax/ToMarkdownContentHandler.java | 542 ++++++++++++ .../tika/sax/ToMarkdownContentHandlerTest.java | 941 +++++++++++++++++++++ .../core/resource/RecursiveMetadataResource.java | 18 +- .../tika/server/core/resource/TikaResource.java | 41 +- .../standard/RecursiveMetadataResourceTest.java | 38 + .../tika/server/standard/TikaResourceTest.java | 14 + 6 files changed, 1584 insertions(+), 10 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java new file mode 100644 index 0000000000..34e5e96cef --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * SAX event handler that writes content as Markdown.
 * Supports headings, paragraphs, bold, italic, links, images, lists (ordered
 * and unordered, including nested), tables (GFM pipe tables), code blocks,
 * inline code, blockquotes, horizontal rules, and definition lists.
 * <p>
 * Content within {@code <script>} and {@code <style>} tags is ignored.
 * </p>
 * <p>
 * Link text and table cells are buffered until their closing tag. Everything
 * produced while such a buffer is open (text and inline markup alike) is
 * collected in that buffer so that it ends up inside the link or cell.
 * </p>
 * <p>
 * The handler keeps per-document state that is never reset, so use a new
 * instance for every document.
 * </p>
 *
 * @since Apache Tika 3.2
 */
public class ToMarkdownContentHandler extends DefaultHandler {

    private static final String STYLE = "style";
    private static final String SCRIPT = "script";

    private final Writer writer;

    /** Currently open lists, innermost first. */
    private final Deque<ListState> listStack = new ArrayDeque<>();

    // Link buffering: non-null linkText means an <a> element is open
    private StringBuilder linkText;
    private String linkHref;

    // Table buffering (only the outermost table is rendered; nested tables are ignored)
    private int tableDepth = 0;
    private List<List<String>> tableRows;
    private List<String> currentRow;
    private StringBuilder currentCell;

    // Blockquote nesting level
    private int blockquoteDepth = 0;

    // Code
    private boolean inPreBlock = false;
    private boolean inInlineCode = false;

    // Script/style suppression
    private int scriptDepth = 0;
    private int styleDepth = 0;

    // Spacing: a block has ended and the next block must be set apart from it
    private boolean needsBlockSeparator = false;

    // True when the last string sent to the writer ended with '\n' (or nothing was sent yet)
    private boolean atLineStart = true;

    // True once anything at all has been sent to the writer
    private boolean hasContent = false;

    /**
     * Creates a handler that writes Markdown to the given writer.
     *
     * @param writer destination of the Markdown output
     */
    public ToMarkdownContentHandler(Writer writer) {
        this.writer = writer;
    }

    /**
     * Creates a handler that writes Markdown to the given stream.
     *
     * @param stream   destination of the Markdown output
     * @param encoding name of the character encoding used for the output
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public ToMarkdownContentHandler(OutputStream stream, String encoding)
            throws UnsupportedEncodingException {
        this(new OutputStreamWriter(stream, encoding));
    }

    /**
     * Creates a handler that collects the Markdown in memory; retrieve it
     * with {@link #toString()}.
     */
    public ToMarkdownContentHandler() {
        this(new StringWriter());
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        String name = localName(localName, qName);

        if (SCRIPT.equals(name)) {
            scriptDepth++;
            return;
        }
        if (STYLE.equals(name)) {
            styleDepth++;
            return;
        }
        if (isSuppressed()) {
            return;
        }

        switch (name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                emitBlockSeparator();
                // second character of the tag name is the heading level
                write(repeatChar('#', name.charAt(1) - '0') + " ");
                break;
            case "p":
            case "div":
                emitBlockSeparator();
                break;
            case "b":
            case "strong":
                write("**");
                break;
            case "i":
            case "em":
                write("*");
                break;
            case "a":
                linkHref = attribute(atts, "href");
                linkText = new StringBuilder();
                break;
            case "img":
                write("![" + attribute(atts, "alt") + "](" + attribute(atts, "src") + ")");
                break;
            case "ul":
            case "ol":
                // a nested list continues its parent item; only a top-level list is a new block
                if (listStack.isEmpty()) {
                    emitBlockSeparator();
                }
                listStack.push(new ListState("ol".equals(name), listStack.size()));
                break;
            case "li":
                startListItem();
                break;
            case "blockquote":
                emitBlockSeparator();
                blockquoteDepth++;
                break;
            case "pre":
                emitBlockSeparator();
                inPreBlock = true;
                write("```\n");
                break;
            case "code":
                // <code> inside <pre> is already covered by the fence
                if (!inPreBlock) {
                    inInlineCode = true;
                    write("`");
                }
                break;
            case "br":
                write("\n");
                break;
            case "hr":
                emitBlockSeparator();
                write("---");
                needsBlockSeparator = true;
                break;
            case "table":
                tableDepth++;
                if (tableDepth == 1) {
                    emitBlockSeparator();
                    tableRows = new ArrayList<>();
                }
                break;
            case "tr":
                if (tableDepth == 1 && tableRows != null) {
                    currentRow = new ArrayList<>();
                }
                break;
            case "th":
            case "td":
                if (tableDepth == 1 && currentRow != null) {
                    currentCell = new StringBuilder();
                }
                break;
            case "dt":
                emitBlockSeparator();
                write("**");
                break;
            case "dd":
                write("\n: ");
                break;
            default:
                // Ignore structural elements like html, head, body, title, meta
                break;
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        String name = localName(localName, qName);

        // Depth counters are clamped at zero so that an unmatched end tag
        // cannot disable the handling of a later, properly opened element.
        if (SCRIPT.equals(name)) {
            scriptDepth = Math.max(0, scriptDepth - 1);
            return;
        }
        if (STYLE.equals(name)) {
            styleDepth = Math.max(0, styleDepth - 1);
            return;
        }
        if (isSuppressed()) {
            return;
        }

        switch (name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "p":
            case "dd":
            case "div":
                needsBlockSeparator = true;
                break;
            case "b":
            case "strong":
            case "dt":
                write("**");
                break;
            case "i":
            case "em":
                write("*");
                break;
            case "a":
                endLink();
                break;
            case "ul":
            case "ol":
                if (!listStack.isEmpty()) {
                    listStack.pop();
                }
                if (listStack.isEmpty()) {
                    needsBlockSeparator = true;
                }
                break;
            case "li":
                // a nested list may already have terminated the line
                if (!targetAtLineStart()) {
                    write("\n");
                }
                break;
            case "blockquote":
                blockquoteDepth = Math.max(0, blockquoteDepth - 1);
                needsBlockSeparator = true;
                break;
            case "pre":
                if (!endsWithNewline()) {
                    write("\n");
                }
                write("```");
                inPreBlock = false;
                needsBlockSeparator = true;
                break;
            case "code":
                if (!inPreBlock) {
                    inInlineCode = false;
                    write("`");
                }
                break;
            case "table":
                if (tableDepth == 1) {
                    emitTable();
                    tableRows = null;
                    currentRow = null;
                    currentCell = null;
                    needsBlockSeparator = true;
                }
                tableDepth = Math.max(0, tableDepth - 1);
                break;
            case "tr":
                if (tableDepth == 1 && tableRows != null && currentRow != null) {
                    tableRows.add(currentRow);
                    currentRow = null;
                }
                break;
            case "th":
            case "td":
                if (tableDepth == 1 && currentRow != null && currentCell != null) {
                    currentRow.add(toSingleLine(currentCell.toString()));
                    currentCell = null;
                }
                break;
            default:
                break;
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (isSuppressed()) {
            return;
        }

        String text = new String(ch, start, length);

        // Code is written without Markdown escaping. Inside a table cell a
        // pipe would still end the cell, so that one character is escaped.
        if (inPreBlock || inInlineCode) {
            write(currentCell != null ? text.replace("|", "\\|") : text);
            return;
        }

        write(escapeMarkdown(text));
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        characters(ch, start, length);
    }

    @Override
    public void endDocument() throws SAXException {
        try {
            writer.flush();
        } catch (IOException e) {
            throw new SAXException("Error flushing character output", e);
        }
    }

    /**
     * Returns the string representation of the underlying writer. This is the
     * generated Markdown when the handler was created with the no-argument
     * constructor (or any other {@link StringWriter}).
     */
    @Override
    public String toString() {
        return writer.toString();
    }

    /** Returns true while inside a script or style element. */
    private boolean isSuppressed() {
        return scriptDepth > 0 || styleDepth > 0;
    }

    /** Returns the open link or cell buffer (link first), or null when output goes to the writer. */
    private StringBuilder activeBuffer() {
        return linkText != null ? linkText : currentCell;
    }

    /** Returns true when the next output would begin a line in the current target. */
    private boolean targetAtLineStart() {
        StringBuilder buffer = activeBuffer();
        if (buffer == null) {
            return atLineStart;
        }
        return buffer.length() == 0 || buffer.charAt(buffer.length() - 1) == '\n';
    }

    /** Returns true when the current target is non-empty and ends with a newline. */
    private boolean endsWithNewline() {
        StringBuilder buffer = activeBuffer();
        if (buffer == null) {
            return hasContent && atLineStart;
        }
        return buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == '\n';
    }

    /** Writes the list marker for a new item, starting a fresh line if necessary. */
    private void startListItem() throws SAXException {
        ListState state = listStack.peek();
        if (state == null) {
            // <li> outside of any list: nothing to mark
            return;
        }
        if (!targetAtLineStart()) {
            write("\n");
        }
        String indent = repeatChar(' ', state.depth * 4);
        if (state.ordered) {
            state.counter++;
            write(indent + state.counter + ". ");
        } else {
            write(indent + "- ");
        }
    }

    /** Closes the open link, if any, and writes it to whatever encloses it. */
    private void endLink() throws SAXException {
        if (linkText == null) {
            return;
        }
        String text = linkText.toString();
        String href = linkHref != null ? linkHref : "";
        // release the buffer first so the link lands in the enclosing cell or the writer
        linkText = null;
        linkHref = null;
        write("[" + text + "](" + href + ")");
    }

    /** Sends the string to the open link/cell buffer, or to the writer when none is open. */
    private void write(String s) throws SAXException {
        if (s.isEmpty()) {
            return;
        }
        StringBuilder buffer = activeBuffer();
        if (buffer != null) {
            buffer.append(s);
        } else {
            emit(s);
        }
    }

    /** Sends the string to the writer, adding blockquote prefixes where lines begin. */
    private void emit(String s) throws SAXException {
        if (s.isEmpty()) {
            return;
        }
        // Preformatted content is passed through untouched.
        String out = (blockquoteDepth > 0 && !inPreBlock) ? withQuotePrefix(s) : s;
        try {
            writer.write(out);
        } catch (IOException e) {
            throw new SAXException("Error writing: " + s, e);
        }
        atLineStart = s.charAt(s.length() - 1) == '\n';
        hasContent = true;
    }

    /** Inserts the blockquote prefix in front of every non-empty line that starts in s. */
    private String withQuotePrefix(String s) {
        String prefix = repeatChar('>', blockquoteDepth) + " ";
        StringBuilder sb = new StringBuilder(s.length() + prefix.length());
        boolean lineStart = atLineStart;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (lineStart && c != '\n') {
                sb.append(prefix);
            }
            sb.append(c);
            lineStart = c == '\n';
        }
        return sb.toString();
    }

    /** Separates the block about to start from a previously finished one. */
    private void emitBlockSeparator() throws SAXException {
        if (!needsBlockSeparator) {
            return;
        }
        needsBlockSeparator = false;
        if (currentCell != null) {
            // a table row must stay on one line: separate blocks with a space
            StringBuilder buffer = activeBuffer();
            if (buffer.length() > 0) {
                buffer.append(' ');
            }
        } else if (hasContent) {
            // The blank line goes to the writer even while link text is being
            // buffered, i.e. in front of the link rather than inside its text.
            emit("\n\n");
        }
    }

    /** Writes the buffered rows as a GFM pipe table; the first row becomes the header. */
    private void emitTable() throws SAXException {
        if (tableRows == null || tableRows.isEmpty()) {
            return;
        }

        int cols = 0;
        for (List<String> row : tableRows) {
            cols = Math.max(cols, row.size());
        }
        if (cols == 0) {
            // rows without a single cell would only produce stray pipes
            return;
        }

        for (int r = 0; r < tableRows.size(); r++) {
            List<String> row = tableRows.get(r);
            StringBuilder line = new StringBuilder("|");
            for (int c = 0; c < cols; c++) {
                // short rows are padded with empty cells
                String cell = c < row.size() ? row.get(c) : "";
                line.append(' ').append(cell).append(" |");
            }
            line.append('\n');
            emit(line.toString());

            if (r == 0) {
                StringBuilder sep = new StringBuilder("|");
                for (int c = 0; c < cols; c++) {
                    sep.append(" --- |");
                }
                sep.append('\n');
                emit(sep.toString());
            }
        }
    }

    /** Replaces every run of line breaks with one space and trims the result. */
    private static String toSingleLine(String text) {
        return text.replaceAll("[\\r\\n]+", " ").trim();
    }

    /** Returns the attribute value, or the empty string when it is absent. */
    private static String attribute(Attributes atts, String name) {
        String value = atts == null ? null : atts.getValue(name);
        return value == null ? "" : value;
    }

    /** Backslash-escapes the characters that would otherwise be read as Markdown markup. */
    private static String escapeMarkdown(String text) {
        StringBuilder sb = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                case '`':
                case '*':
                case '_':
                case '[':
                case ']':
                case '#':
                case '|':
                    sb.append('\\').append(c);
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return sb.toString();
    }

    /** Returns a string of count copies of c; empty when count is not positive. */
    private static String repeatChar(char c, int count) {
        if (count <= 0) {
            return "";
        }
        StringBuilder sb = new StringBuilder(count);
        for (int i = 0; i < count; i++) {
            sb.append(c);
        }
        return sb.toString();
    }

    /** Returns the lower-cased element name, falling back to qName without its prefix. */
    private static String localName(String localName, String qName) {
        if (localName != null && !localName.isEmpty()) {
            return localName.toLowerCase(Locale.ROOT);
        }
        if (qName != null) {
            // Strip namespace prefix
            int colon = qName.indexOf(':');
            String name = colon >= 0 ? qName.substring(colon + 1) : qName;
            return name.toLowerCase(Locale.ROOT);
        }
        return "";
    }

    /** State of one open list: its kind, nesting depth and running item number. */
    private static final class ListState {
        final boolean ordered;
        final int depth;
        int counter;

        ListState(boolean ordered, int depth) {
            this.ordered = ordered;
            this.depth = depth;
            this.counter = 0;
        }
    }
}
+ */ +package org.apache.tika.sax; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Random; + +import org.junit.jupiter.api.RepeatedTest; +import org.junit.jupiter.api.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Test cases for the {@link ToMarkdownContentHandler} class. + */ +public class ToMarkdownContentHandlerTest { + + private static final String XHTML = "http://www.w3.org/1999/xhtml"; + private static final Attributes EMPTY = new AttributesImpl(); + + private static void startElement(ContentHandler handler, String name) throws Exception { + handler.startElement(XHTML, name, name, EMPTY); + } + + private static void startElement(ContentHandler handler, String name, String attrName, + String attrValue) throws Exception { + AttributesImpl atts = new AttributesImpl(); + atts.addAttribute("", attrName, attrName, "CDATA", attrValue); + handler.startElement(XHTML, name, name, atts); + } + + private static void startElement(ContentHandler handler, String name, AttributesImpl atts) + throws Exception { + handler.startElement(XHTML, name, name, atts); + } + + private static void endElement(ContentHandler handler, String name) throws Exception { + handler.endElement(XHTML, name, name); + } + + private static void chars(ContentHandler handler, String text) throws Exception { + char[] ch = text.toCharArray(); + handler.characters(ch, 0, ch.length); + } + + @Test + public void testHeadings() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "h1"); + chars(handler, "Title"); + endElement(handler, "h1"); + + startElement(handler, "h2"); + chars(handler, "Subtitle"); + 
endElement(handler, "h2"); + + startElement(handler, "h3"); + chars(handler, "Section"); + endElement(handler, "h3"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("# Title")); + assertTrue(result.contains("## Subtitle")); + assertTrue(result.contains("### Section")); + } + + @Test + public void testAllHeadingLevels() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + for (int i = 1; i <= 6; i++) { + startElement(handler, "h" + i); + chars(handler, "H" + i); + endElement(handler, "h" + i); + } + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("# H1")); + assertTrue(result.contains("## H2")); + assertTrue(result.contains("### H3")); + assertTrue(result.contains("#### H4")); + assertTrue(result.contains("##### H5")); + assertTrue(result.contains("###### H6")); + } + + @Test + public void testParagraphs() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "First paragraph."); + endElement(handler, "p"); + + startElement(handler, "p"); + chars(handler, "Second paragraph."); + endElement(handler, "p"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("First paragraph.")); + assertTrue(result.contains("Second paragraph.")); + // Paragraphs should be separated by blank line + assertTrue(result.contains("First paragraph.\n\nSecond paragraph.")); + } + + @Test + public void testBold() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "This is "); + startElement(handler, "b"); + chars(handler, "bold"); + endElement(handler, "b"); + chars(handler, " text."); + endElement(handler, "p"); + + handler.endDocument(); + + String result = 
handler.toString(); + assertTrue(result.contains("**bold**")); + } + + @Test + public void testStrong() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + startElement(handler, "strong"); + chars(handler, "strong"); + endElement(handler, "strong"); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("**strong**")); + } + + @Test + public void testItalic() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "This is "); + startElement(handler, "i"); + chars(handler, "italic"); + endElement(handler, "i"); + chars(handler, " text."); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("*italic*")); + } + + @Test + public void testEmphasis() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + startElement(handler, "em"); + chars(handler, "emphasized"); + endElement(handler, "em"); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("*emphasized*")); + } + + @Test + public void testLink() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "Click "); + startElement(handler, "a", "href", "https://example.com"); + chars(handler, "here"); + endElement(handler, "a"); + chars(handler, " for more."); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("[here](https://example.com)")); + } + + @Test + public void testImage() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + AttributesImpl 
atts = new AttributesImpl(); + atts.addAttribute("", "alt", "alt", "CDATA", "A photo"); + atts.addAttribute("", "src", "src", "CDATA", "photo.jpg"); + startElement(handler, "img", atts); + endElement(handler, "img"); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("")); + } + + @Test + public void testUnorderedList() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "ul"); + startElement(handler, "li"); + chars(handler, "Apple"); + endElement(handler, "li"); + startElement(handler, "li"); + chars(handler, "Banana"); + endElement(handler, "li"); + startElement(handler, "li"); + chars(handler, "Cherry"); + endElement(handler, "li"); + endElement(handler, "ul"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("- Apple")); + assertTrue(result.contains("- Banana")); + assertTrue(result.contains("- Cherry")); + } + + @Test + public void testOrderedList() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "ol"); + startElement(handler, "li"); + chars(handler, "First"); + endElement(handler, "li"); + startElement(handler, "li"); + chars(handler, "Second"); + endElement(handler, "li"); + startElement(handler, "li"); + chars(handler, "Third"); + endElement(handler, "li"); + endElement(handler, "ol"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("1. First")); + assertTrue(result.contains("2. Second")); + assertTrue(result.contains("3. 
Third")); + } + + @Test + public void testNestedLists() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "ul"); + startElement(handler, "li"); + chars(handler, "Fruit"); + + startElement(handler, "ul"); + startElement(handler, "li"); + chars(handler, "Apple"); + endElement(handler, "li"); + startElement(handler, "li"); + chars(handler, "Banana"); + endElement(handler, "li"); + endElement(handler, "ul"); + + endElement(handler, "li"); + startElement(handler, "li"); + chars(handler, "Vegetable"); + endElement(handler, "li"); + endElement(handler, "ul"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("- Fruit")); + assertTrue(result.contains(" - Apple")); + assertTrue(result.contains(" - Banana")); + assertTrue(result.contains("- Vegetable")); + } + + @Test + public void testTable() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "table"); + + // Header row + startElement(handler, "tr"); + startElement(handler, "th"); + chars(handler, "Name"); + endElement(handler, "th"); + startElement(handler, "th"); + chars(handler, "Age"); + endElement(handler, "th"); + endElement(handler, "tr"); + + // Data row + startElement(handler, "tr"); + startElement(handler, "td"); + chars(handler, "Alice"); + endElement(handler, "td"); + startElement(handler, "td"); + chars(handler, "30"); + endElement(handler, "td"); + endElement(handler, "tr"); + + endElement(handler, "table"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("| Name | Age |")); + assertTrue(result.contains("| --- | --- |")); + assertTrue(result.contains("| Alice | 30 |")); + } + + @Test + public void testFencedCodeBlock() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + 
startElement(handler, "pre"); + startElement(handler, "code"); + chars(handler, "int x = 42;"); + endElement(handler, "code"); + endElement(handler, "pre"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("```\n")); + assertTrue(result.contains("int x = 42;")); + assertTrue(result.contains("\n```")); + } + + @Test + public void testInlineCode() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "Use the "); + startElement(handler, "code"); + chars(handler, "println"); + endElement(handler, "code"); + chars(handler, " function."); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("`println`")); + } + + @Test + public void testBlockquote() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "blockquote"); + chars(handler, "To be or not to be."); + endElement(handler, "blockquote"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("> To be or not to be.")); + } + + @Test + public void testHorizontalRule() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "Above"); + endElement(handler, "p"); + + startElement(handler, "hr"); + endElement(handler, "hr"); + + startElement(handler, "p"); + chars(handler, "Below"); + endElement(handler, "p"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("---")); + assertTrue(result.contains("Above")); + assertTrue(result.contains("Below")); + } + + @Test + public void testLineBreak() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "Line one"); + 
startElement(handler, "br"); + endElement(handler, "br"); + chars(handler, "Line two"); + endElement(handler, "p"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("Line one\nLine two")); + } + + @Test + public void testBoldInsideListItem() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "ul"); + startElement(handler, "li"); + startElement(handler, "b"); + chars(handler, "Important"); + endElement(handler, "b"); + chars(handler, " item"); + endElement(handler, "li"); + endElement(handler, "ul"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("- **Important** item")); + } + + @Test + public void testLinkInsideHeading() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "h2"); + startElement(handler, "a", "href", "https://example.com"); + chars(handler, "Linked Title"); + endElement(handler, "a"); + endElement(handler, "h2"); + + handler.endDocument(); + + assertTrue(handler.toString().contains("## [Linked Title](https://example.com)")); + } + + @Test + public void testScriptContentSkipped() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "Before"); + endElement(handler, "p"); + + startElement(handler, "script"); + chars(handler, "alert('xss');"); + endElement(handler, "script"); + + startElement(handler, "p"); + chars(handler, "After"); + endElement(handler, "p"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("Before")); + assertTrue(result.contains("After")); + assertFalse(result.contains("alert")); + } + + @Test + public void testStyleContentSkipped() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + 
startElement(handler, "p"); + chars(handler, "Visible"); + endElement(handler, "p"); + + startElement(handler, "style"); + chars(handler, "body { color: red; }"); + endElement(handler, "style"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("Visible")); + assertFalse(result.contains("color")); + } + + @Test + public void testMarkdownEscaping() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "Special chars: * _ [ ] # | \\ `"); + endElement(handler, "p"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("\\*")); + assertTrue(result.contains("\\_")); + assertTrue(result.contains("\\[")); + assertTrue(result.contains("\\]")); + assertTrue(result.contains("\\#")); + assertTrue(result.contains("\\|")); + assertTrue(result.contains("\\\\")); + assertTrue(result.contains("\\`")); + } + + @Test + public void testNoEscapingInCodeBlock() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "pre"); + startElement(handler, "code"); + chars(handler, "x * y = z"); + endElement(handler, "code"); + endElement(handler, "pre"); + + handler.endDocument(); + + String result = handler.toString(); + // Inside code blocks, * should NOT be escaped + assertTrue(result.contains("x * y = z")); + assertFalse(result.contains("\\*")); + } + + @Test + public void testNoEscapingInInlineCode() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "p"); + startElement(handler, "code"); + chars(handler, "a*b"); + endElement(handler, "code"); + endElement(handler, "p"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("`a*b`")); + } + + @Test + public void 
testDefinitionList() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "dl"); + startElement(handler, "dt"); + chars(handler, "Term"); + endElement(handler, "dt"); + startElement(handler, "dd"); + chars(handler, "Definition of the term"); + endElement(handler, "dd"); + endElement(handler, "dl"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("**Term**")); + assertTrue(result.contains(": Definition of the term")); + } + + @Test + public void testDiv() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "div"); + chars(handler, "Content in div"); + endElement(handler, "div"); + + startElement(handler, "div"); + chars(handler, "Another div"); + endElement(handler, "div"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("Content in div")); + assertTrue(result.contains("Another div")); + // Divs should be separated + assertTrue(result.contains("Content in div\n\nAnother div")); + } + + @Test + public void testHandlerTypeParsingMarkdown() { + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, + BasicContentHandlerFactory.parseHandlerType("markdown", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT)); + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, + BasicContentHandlerFactory.parseHandlerType("md", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT)); + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, + BasicContentHandlerFactory.parseHandlerType("MARKDOWN", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT)); + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, + BasicContentHandlerFactory.parseHandlerType("MD", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT)); + } + + @Test + public void testFactoryCreatesMarkdownHandler() { + 
BasicContentHandlerFactory factory = + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, -1); + org.xml.sax.ContentHandler handler = factory.createHandler(); + assertTrue(handler instanceof ToMarkdownContentHandler); + } + + @Test + public void testTableWithOnlyTd() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "table"); + + startElement(handler, "tr"); + startElement(handler, "td"); + chars(handler, "A"); + endElement(handler, "td"); + startElement(handler, "td"); + chars(handler, "B"); + endElement(handler, "td"); + endElement(handler, "tr"); + + startElement(handler, "tr"); + startElement(handler, "td"); + chars(handler, "C"); + endElement(handler, "td"); + startElement(handler, "td"); + chars(handler, "D"); + endElement(handler, "td"); + endElement(handler, "tr"); + + endElement(handler, "table"); + + handler.endDocument(); + + String result = handler.toString(); + assertTrue(result.contains("| A | B |")); + assertTrue(result.contains("| --- | --- |")); + assertTrue(result.contains("| C | D |")); + } + + @Test + public void testNestedTablesIgnored() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + handler.startDocument(); + + startElement(handler, "table"); + + // Outer header row + startElement(handler, "tr"); + startElement(handler, "th"); + chars(handler, "Outer1"); + endElement(handler, "th"); + startElement(handler, "th"); + chars(handler, "Outer2"); + endElement(handler, "th"); + endElement(handler, "tr"); + + // Outer data row with nested table in second cell + startElement(handler, "tr"); + startElement(handler, "td"); + chars(handler, "A"); + endElement(handler, "td"); + startElement(handler, "td"); + chars(handler, "B"); + + // Nested table -- should be ignored + startElement(handler, "table"); + startElement(handler, "tr"); + startElement(handler, "td"); + chars(handler, "Inner"); + 
endElement(handler, "td"); + endElement(handler, "tr"); + endElement(handler, "table"); + + endElement(handler, "td"); + endElement(handler, "tr"); + + endElement(handler, "table"); + + handler.endDocument(); + + String result = handler.toString(); + // Outer table should be rendered + assertTrue(result.contains("| Outer1 | Outer2 |")); + assertTrue(result.contains("| --- | --- |")); + // Inner cell text gets folded into the outer cell ("B" + "Inner" = "BInner") + assertTrue(result.contains("| A | BInner |")); + // Inner table structure should not appear as a separate table + assertFalse(result.contains("| Inner |")); + } + + private static final String[] ALL_ELEMENTS = { + "h1", "h2", "h3", "h4", "h5", "h6", + "p", "div", "span", + "b", "strong", "i", "em", + "a", "img", + "ul", "ol", "li", + "table", "tr", "th", "td", + "blockquote", "pre", "code", + "br", "hr", + "dl", "dt", "dd", + "script", "style", + "html", "head", "body", "title", "meta" + }; + + /** + * Randomized test: fire random sequences of startElement/endElement/characters + * events with no guarantee of proper nesting. The handler must not throw any + * runtime exceptions (e.g., EmptyStackException, NullPointerException, + * IndexOutOfBoundsException). 
+     */
+    @RepeatedTest(20)
+    public void testRandomUnbalancedTags() throws Exception {
+        // Choose the seed explicitly and report it when the run fails: with a bare
+        // "new Random()" a failing repetition of this fuzz test could not be replayed.
+        final long seed = System.nanoTime();
+        final Random rng = new Random(seed);
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+
+        assertDoesNotThrow(() -> {
+            handler.startDocument();
+
+            int numEvents = 50 + rng.nextInt(150);
+            for (int i = 0; i < numEvents; i++) {
+                int action = rng.nextInt(4);
+                String elem = ALL_ELEMENTS[rng.nextInt(ALL_ELEMENTS.length)];
+                switch (action) {
+                    case 0:
+                        // start element (possibly with attributes)
+                        if (elem.equals("a")) {
+                            startElement(handler, elem, "href", "http://example.com");
+                        } else if (elem.equals("img")) {
+                            AttributesImpl atts = new AttributesImpl();
+                            atts.addAttribute("", "src", "src", "CDATA", "img.png");
+                            atts.addAttribute("", "alt", "alt", "CDATA", "alt text");
+                            startElement(handler, elem, atts);
+                        } else {
+                            startElement(handler, elem);
+                        }
+                        break;
+                    case 1:
+                        // end element (possibly unmatched)
+                        endElement(handler, elem);
+                        break;
+                    case 2:
+                        // characters
+                        chars(handler, "text_" + i);
+                        break;
+                    case 3:
+                        // ignorable whitespace
+                        char[] ws = " \t\n".toCharArray();
+                        handler.ignorableWhitespace(ws, 0, ws.length);
+                        break;
+                }
+            }
+
+            handler.endDocument();
+        }, "random event sequence threw; reproduce with seed=" + seed);
+
+        // Just verify we can get the output without error
+        assertDoesNotThrow(() -> handler.toString());
+    }
+
+    /**
+     * Test extra endElement calls with no matching start -- should not throw.
+ */ + @Test + public void testExtraEndElements() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + + assertDoesNotThrow(() -> { + handler.startDocument(); + + // End elements with no matching starts + endElement(handler, "p"); + endElement(handler, "table"); + endElement(handler, "tr"); + endElement(handler, "td"); + endElement(handler, "ul"); + endElement(handler, "li"); + endElement(handler, "a"); + endElement(handler, "pre"); + endElement(handler, "code"); + endElement(handler, "blockquote"); + endElement(handler, "b"); + endElement(handler, "i"); + endElement(handler, "script"); + endElement(handler, "style"); + + handler.endDocument(); + }); + } + + /** + * Test start elements with no matching end -- should not throw. + */ + @Test + public void testUnclosedElements() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + + assertDoesNotThrow(() -> { + handler.startDocument(); + + startElement(handler, "p"); + chars(handler, "unclosed paragraph"); + startElement(handler, "b"); + chars(handler, "unclosed bold"); + startElement(handler, "a", "href", "http://example.com"); + chars(handler, "unclosed link"); + startElement(handler, "ul"); + startElement(handler, "li"); + chars(handler, "unclosed list item"); + startElement(handler, "table"); + startElement(handler, "tr"); + startElement(handler, "td"); + chars(handler, "unclosed cell"); + startElement(handler, "blockquote"); + chars(handler, "unclosed quote"); + startElement(handler, "pre"); + chars(handler, "unclosed pre"); + + handler.endDocument(); + }); + } + + /** + * Test deeply nested elements of the same type -- should not throw. 
+ */ + @Test + public void testDeeplyNestedSameElement() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + + assertDoesNotThrow(() -> { + handler.startDocument(); + + // Deeply nested lists + for (int i = 0; i < 50; i++) { + startElement(handler, "ul"); + startElement(handler, "li"); + chars(handler, "level " + i); + } + for (int i = 0; i < 50; i++) { + endElement(handler, "li"); + endElement(handler, "ul"); + } + + // Deeply nested blockquotes + for (int i = 0; i < 20; i++) { + startElement(handler, "blockquote"); + } + chars(handler, "deep quote"); + for (int i = 0; i < 20; i++) { + endElement(handler, "blockquote"); + } + + handler.endDocument(); + }); + } + + /** + * Test interleaved (improperly nested) elements -- should not throw. + */ + @Test + public void testInterleavedElements() throws Exception { + ToMarkdownContentHandler handler = new ToMarkdownContentHandler(); + + assertDoesNotThrow(() -> { + handler.startDocument(); + + // <b><i>text</b></i> -- improper nesting + startElement(handler, "b"); + startElement(handler, "i"); + chars(handler, "interleaved"); + endElement(handler, "b"); + endElement(handler, "i"); + + // <table><p>text</table></p> + startElement(handler, "table"); + startElement(handler, "p"); + chars(handler, "table with p"); + endElement(handler, "table"); + endElement(handler, "p"); + + // <ul><h1>text</ul></h1> + startElement(handler, "ul"); + startElement(handler, "h1"); + chars(handler, "list with heading"); + endElement(handler, "ul"); + endElement(handler, "h1"); + + handler.endDocument(); + }); + } +} diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index 7fbf7f2566..a129e47f30 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -106,12 +106,13 @@ public class RecursiveMetadataResource { * The extracted text content is stored with the key * {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}. * <p> - * Specify the handler for the content (xml, html, text, ignore) + * Specify the handler for the content (xml, html, text, markdown, ignore) * in the path:<br/> * /rmeta/form (default: xml)<br/> - * /rmeta/form/xml (store the content as xml)<br/> - * /rmeta/form/text (store the content as text)<br/> - * /rmeta/form/ignore (don't record any content)<br/> + * /rmeta/form/xml (store the content as xml)<br/> + * /rmeta/form/text (store the content as text)<br/> + * /rmeta/form/markdown (store the content as markdown)<br/> + * /rmeta/form/ignore (don't record any content)<br/> * * @param att attachment * @param info uri info @@ -182,12 +183,13 @@ public class RecursiveMetadataResource { * The extracted text content is stored with the key * {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}. 
* <p> - * Specify the handler for the content (xml, html, text, ignore) + * Specify the handler for the content (xml, html, text, markdown, ignore) * in the path:<br/> * /rmeta (default: xml)<br/> - * /rmeta/xml (store the content as xml)<br/> - * /rmeta/text (store the content as text)<br/> - * /rmeta/ignore (don't record any content)<br/> + * /rmeta/xml (store the content as xml)<br/> + * /rmeta/text (store the content as text)<br/> + * /rmeta/markdown (store the content as markdown)<br/> + * /rmeta/ignore (don't record any content)<br/> * * @param handlerTypeName which type of handler to use * @return InputStream that can be deserialized as a list of {@link Metadata} objects diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 5819ccb551..354331ce38 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -76,7 +76,7 @@ public class TikaResource { public static final String GREETING = "This is Tika Server (" + Tika.getString() + "). Please PUT\n"; /** * Header to specify the handler type for content extraction. - * Valid values: text, html, xml, ignore (default: text) + * Valid values: text, html, xml, markdown, ignore (default: text) */ public static final String HANDLER_TYPE_HEADER = "X-Tika-Handler"; private static final String META_PREFIX = "meta_"; @@ -549,6 +549,21 @@ public class TikaResource { return produceRawOutput(tis, Metadata.newInstance(context), httpHeaders.getRequestHeaders(), "xml"); } + /** + * Parse document and return Markdown content. 
+ */ + @PUT + @Consumes("*/*") + @Produces("text/plain") + @Path("md") + public StreamingOutput getMarkdown(final InputStream is, @Context HttpHeaders httpHeaders) + throws IOException { + TikaInputStream tis = TikaInputStream.get(is); + tis.getPath(); // Spool to temp file for pipes-based parsing + ParseContext context = createParseContext(); + return produceRawOutput(tis, Metadata.newInstance(context), httpHeaders.getRequestHeaders(), "md"); + } + /** * Parse document and return JSON with metadata and text content. */ @@ -675,6 +690,28 @@ public class TikaResource { return produceRawOutput(tis, metadata, context, "xml"); } + /** + * Parse multipart document with optional config, return Markdown. + * <p> + * Accepts multipart with: + * - "file" part (required): the document to parse + * - "config" part (optional): JSON configuration for parser settings + * <p> + * This endpoint is gated behind enableUnsecureFeatures=true because per-request + * configuration could enable dangerous operations. + */ + @POST + @Consumes("multipart/form-data") + @Produces("text/plain") + @Path("config/md") + public StreamingOutput postMarkdown(List<Attachment> attachments, @Context HttpHeaders httpHeaders) + throws IOException { + ParseContext context = createParseContext(); + Metadata metadata = Metadata.newInstance(context); + TikaInputStream tis = setupMultipartConfig(attachments, metadata, context); + return produceRawOutput(tis, metadata, context, "md"); + } + /** * Parse multipart document with optional config, return JSON. * <p> @@ -702,7 +739,7 @@ public class TikaResource { // ==================== Internal methods ==================== /** - * Produces raw streaming output (text, html, xml) using pipes-based parsing. + * Produces raw streaming output (text, html, xml, md) using pipes-based parsing. 
*/ private StreamingOutput produceRawOutput(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders, diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java index 38e51de0c0..a03d8055fa 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java @@ -18,6 +18,7 @@ package org.apache.tika.server.standard; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -54,6 +55,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { private static final String FORM_PATH = "/form"; private static final String META_PATH = "/rmeta"; private static final String TEXT_PATH = "/text"; + private static final String MD_PATH = "/md"; private static final String IGNORE_PATH = "/ignore"; private static final String XML_PATH = "/xml"; private static final String UNPARSEABLE_PATH = "/somethingOrOther"; @@ -327,6 +329,23 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { .get(6) .get(TikaCoreProperties.TIKA_CONTENT)); + //markdown + response = WebClient + .create(endPoint + META_PATH + MD_PATH) + .accept("application/json") + .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); + metadataList = JsonMetadataList.fromJson(reader); + assertEquals(12, metadataList.size()); + content = 
metadataList + .get(6) + .get(TikaCoreProperties.TIKA_CONTENT) + .trim(); + // Markdown output should not contain HTML/XML tags + assertFalse(content.startsWith("<html")); + // Should contain the document text + assertContains("plundered our seas", content); + } @Test @@ -419,6 +438,25 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { assertNull(metadataList .get(6) .get(TikaCoreProperties.TIKA_CONTENT)); + + //markdown + attachmentPart = + new Attachment("myworddocx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + MD_PATH); + + response = webClient + .type("multipart/form-data") + .accept("application/json") + .post(attachmentPart); + reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); + metadataList = JsonMetadataList.fromJson(reader); + assertEquals(12, metadataList.size()); + content = metadataList + .get(6) + .get(TikaCoreProperties.TIKA_CONTENT) + .trim(); + assertFalse(content.startsWith("<html")); + assertContains("plundered our seas", content); } @Test diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java index 0b4fe94fb0..934bafc9e5 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java @@ -159,6 +159,20 @@ public class TikaResourceTest extends CXFTestBase { assertEquals(UNPROCESSEABLE, response.getStatus()); } + @Test + public void testSimpleWordMarkdown() throws Exception { + Response response = WebClient + .create(endPoint + TIKA_PATH + "/md") + .type("application/msword") + .put(ClassLoader.getSystemResourceAsStream(TEST_DOC)); + String 
responseMsg = getStringFromInputStream((InputStream) response.getEntity()); + assertTrue(responseMsg.contains("test")); + // Should not contain HTML/XML tags + assertFalse(responseMsg.contains("<html")); + assertFalse(responseMsg.contains("<body")); + assertFalse(responseMsg.contains("<p>")); + } + @Test public void testSimpleWordHTML() throws Exception { Response response = WebClient
