This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 1a3bd2d104 TIKA-4653 -- fix up extra whitespace
1a3bd2d104 is described below
commit 1a3bd2d10434fefbfc9976e90c7360a38b6448b6
Author: tallison <[email protected]>
AuthorDate: Tue Feb 10 14:59:35 2026 -0500
TIKA-4653 -- fix up extra whitespace
---
.../apache/tika/sax/ToMarkdownContentHandler.java | 20 ++++++-
.../tika/sax/ToMarkdownContentHandlerTest.java | 68 ++++++++++++++++++++++
2 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
index 34e5e96cef..c92ae55549 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
@@ -81,6 +81,9 @@ public class ToMarkdownContentHandler extends DefaultHandler {
// Track if we've written any content at all
private boolean hasContent = false;
+ // Track if meaningful (non-whitespace) content was written since last block separator
+ private boolean hasContentSinceLastSeparator = false;
+
public ToMarkdownContentHandler(Writer writer) {
this.writer = writer;
}
@@ -392,6 +395,14 @@ public class ToMarkdownContentHandler extends DefaultHandler {
return;
}
+ // Skip whitespace-only text at line start; preserve inline spaces
+ if (text.trim().isEmpty()) {
+ if (!atLineStart) {
+ write(" ");
+ }
+ return;
+ }
+
// Escape markdown special characters in normal text
text = escapeMarkdown(text);
@@ -404,6 +415,7 @@ public class ToMarkdownContentHandler extends DefaultHandler {
if (!text.isEmpty()) {
write(text);
hasContent = true;
+ hasContentSinceLastSeparator = true;
}
}
@@ -431,6 +443,9 @@ public class ToMarkdownContentHandler extends DefaultHandler {
writer.write(s);
if (!s.isEmpty()) {
atLineStart = s.charAt(s.length() - 1) == '\n';
+ if (!s.trim().isEmpty()) {
+ hasContentSinceLastSeparator = true;
+ }
}
} catch (IOException e) {
throw new SAXException("Error writing: " + s, e);
@@ -438,10 +453,13 @@ public class ToMarkdownContentHandler extends DefaultHandler {
}
private void emitBlockSeparator() throws SAXException {
- if (needsBlockSeparator && hasContent) {
+ if (needsBlockSeparator && hasContent && hasContentSinceLastSeparator) {
write("\n\n");
needsBlockSeparator = false;
atLineStart = true;
+ hasContentSinceLastSeparator = false;
+ } else {
+ needsBlockSeparator = false;
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
index 822962a96c..c1f75b32f1 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
@@ -629,6 +629,74 @@ public class ToMarkdownContentHandlerTest {
assertTrue(result.contains("Content in div\n\nAnother div"));
}
+ @Test
+ public void testNoExcessiveBlankLines() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ // Simulate SAX events with whitespace text nodes between elements,
+ // as typically produced by XHTML parsers
+ startElement(handler, "div");
+ chars(handler, "\n ");
+ startElement(handler, "p");
+ chars(handler, "First");
+ endElement(handler, "p");
+ chars(handler, "\n ");
+ endElement(handler, "div");
+
+ chars(handler, "\n ");
+
+ startElement(handler, "div");
+ chars(handler, "\n ");
+ endElement(handler, "div");
+
+ chars(handler, "\n ");
+
+ startElement(handler, "div");
+ chars(handler, "\n ");
+ startElement(handler, "p");
+ chars(handler, "Second");
+ endElement(handler, "p");
+ chars(handler, "\n ");
+ endElement(handler, "div");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ // Should not have more than one blank line (two consecutive newlines) anywhere
+ assertFalse(result.contains("\n\n\n"),
+ "Output should not contain triple newlines: " + result);
+ assertContains("First", result);
+ assertContains("Second", result);
+ }
+
+ @Test
+ public void testInlineSpacesPreserved() throws Exception {
+ ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+ handler.startDocument();
+
+ startElement(handler, "p");
+ startElement(handler, "b");
+ chars(handler, "bold");
+ endElement(handler, "b");
+ chars(handler, " ");
+ startElement(handler, "i");
+ chars(handler, "italic");
+ endElement(handler, "i");
+ endElement(handler, "p");
+
+ handler.endDocument();
+
+ String result = handler.toString();
+ // Space between bold and italic should be preserved
+ assertTrue(result.contains("**bold** *italic*"));
+ }
+
+ private static void assertContains(String needle, String haystack) {
+ assertTrue(haystack.contains(needle),
+ "Expected to find '" + needle + "' in: " + haystack);
+ }
+
@Test
public void testHandlerTypeParsingMarkdown() {
assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,