This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 282eb64afe TIKA-4653 -- fix up extra whitespace (#2602)
282eb64afe is described below

commit 282eb64afefcfeff15451348448cf8bc7fe59080
Author: Tim Allison <[email protected]>
AuthorDate: Tue Feb 10 15:31:52 2026 -0500

    TIKA-4653 -- fix up extra whitespace (#2602)
---
 .../apache/tika/sax/ToMarkdownContentHandler.java  | 20 ++++++-
 .../tika/sax/ToMarkdownContentHandlerTest.java     | 68 ++++++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
index 34e5e96cef..c92ae55549 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToMarkdownContentHandler.java
@@ -81,6 +81,9 @@ public class ToMarkdownContentHandler extends DefaultHandler {
     // Track if we've written any content at all
     private boolean hasContent = false;
 
+    // Track if meaningful (non-whitespace) content was written since last block separator
+    private boolean hasContentSinceLastSeparator = false;
+
     public ToMarkdownContentHandler(Writer writer) {
         this.writer = writer;
     }
@@ -392,6 +395,14 @@ public class ToMarkdownContentHandler extends DefaultHandler {
             return;
         }
 
+        // Skip whitespace-only text at line start; preserve inline spaces
+        if (text.trim().isEmpty()) {
+            if (!atLineStart) {
+                write(" ");
+            }
+            return;
+        }
+
         // Escape markdown special characters in normal text
         text = escapeMarkdown(text);
 
@@ -404,6 +415,7 @@ public class ToMarkdownContentHandler extends DefaultHandler {
         if (!text.isEmpty()) {
             write(text);
             hasContent = true;
+            hasContentSinceLastSeparator = true;
         }
     }
 
@@ -431,6 +443,9 @@ public class ToMarkdownContentHandler extends DefaultHandler {
             writer.write(s);
             if (!s.isEmpty()) {
                 atLineStart = s.charAt(s.length() - 1) == '\n';
+                if (!s.trim().isEmpty()) {
+                    hasContentSinceLastSeparator = true;
+                }
             }
         } catch (IOException e) {
             throw new SAXException("Error writing: " + s, e);
@@ -438,10 +453,13 @@ public class ToMarkdownContentHandler extends DefaultHandler {
     }
 
     private void emitBlockSeparator() throws SAXException {
-        if (needsBlockSeparator && hasContent) {
+        if (needsBlockSeparator && hasContent && hasContentSinceLastSeparator) {
             write("\n\n");
             needsBlockSeparator = false;
             atLineStart = true;
+            hasContentSinceLastSeparator = false;
+        } else {
+            needsBlockSeparator = false;
         }
     }
 
diff --git a/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
index 1ba3523a23..298d11c70f 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/ToMarkdownContentHandlerTest.java
@@ -629,6 +629,74 @@ public class ToMarkdownContentHandlerTest {
         assertTrue(result.contains("Content in div\n\nAnother div"));
     }
 
+    @Test
+    public void testNoExcessiveBlankLines() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        // Simulate SAX events with whitespace text nodes between elements,
+        // as typically produced by XHTML parsers
+        startElement(handler, "div");
+        chars(handler, "\n  ");
+        startElement(handler, "p");
+        chars(handler, "First");
+        endElement(handler, "p");
+        chars(handler, "\n  ");
+        endElement(handler, "div");
+
+        chars(handler, "\n  ");
+
+        startElement(handler, "div");
+        chars(handler, "\n  ");
+        endElement(handler, "div");
+
+        chars(handler, "\n  ");
+
+        startElement(handler, "div");
+        chars(handler, "\n  ");
+        startElement(handler, "p");
+        chars(handler, "Second");
+        endElement(handler, "p");
+        chars(handler, "\n  ");
+        endElement(handler, "div");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        // Should not have more than one blank line (two consecutive newlines) anywhere
+        assertFalse(result.contains("\n\n\n"),
+                "Output should not contain triple newlines: " + result);
+        assertContains("First", result);
+        assertContains("Second", result);
+    }
+
+    @Test
+    public void testInlineSpacesPreserved() throws Exception {
+        ToMarkdownContentHandler handler = new ToMarkdownContentHandler();
+        handler.startDocument();
+
+        startElement(handler, "p");
+        startElement(handler, "b");
+        chars(handler, "bold");
+        endElement(handler, "b");
+        chars(handler, " ");
+        startElement(handler, "i");
+        chars(handler, "italic");
+        endElement(handler, "i");
+        endElement(handler, "p");
+
+        handler.endDocument();
+
+        String result = handler.toString();
+        // Space between bold and italic should be preserved
+        assertTrue(result.contains("**bold** *italic*"));
+    }
+
+    private static void assertContains(String needle, String haystack) {
+        assertTrue(haystack.contains(needle),
+                "Expected to find '" + needle + "' in: " + haystack);
+    }
+
     @Test
     public void testHandlerTypeParsingMarkdown() {
         assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,

Reply via email to