This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4530
in repository https://gitbox.apache.org/repos/asf/tika.git

commit af47456dc5a4940d182c0cad2d1b4a20cb4c422a
Author: tallison <[email protected]>
AuthorDate: Mon Oct 27 12:22:58 2025 -0400

    TIKA-4530 -- don't let body content slip into headers
---
 .../org/apache/tika/parser/mbox/MboxParser.java    | 24 +++++++++++++++-------
 .../apache/tika/parser/mbox/MboxParserTest.java    | 13 ++++++++++++
 .../test/resources/test-documents/multiline2.mbox  |  7 +++++++
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index dddd9bd92..d73626eef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -49,6 +49,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.mailcommons.MailUtil;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
@@ -98,7 +99,11 @@ public class MboxParser implements Parser {
         try (BufferedReader reader = new BufferedReader(isr)) {
             String curLine = reader.readLine();
             int mailItem = 0;
+            boolean inHeader = true;
             do {
+                if (curLine.contains("1495533845574511907")) {
+                    System.out.println("here");
+                }
                 if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
                     Metadata mailMetadata = new Metadata();
                     Queue<String> multiline = new LinkedList<>();
@@ -111,16 +116,21 @@ public class MboxParser implements Parser {
                     if (curLine == null) {
                         break;
                     }
+
                     UnsynchronizedByteArrayOutputStream message = UnsynchronizedByteArrayOutputStream.builder().setBufferSize(100000).get();
                     do {
-                        if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
-                            String latestLine = multiline.poll();
-                            latestLine += " " + curLine.trim();
-                            multiline.add(latestLine);
-                        } else {
-                            multiline.add(curLine);
+                        if (inHeader && StringUtils.isBlank(curLine)) {
+                            inHeader = false;
+                        }
+                        if (inHeader) {
+                            if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                                String latestLine = multiline.poll();
+                                latestLine += " " + curLine.trim();
+                                multiline.add(latestLine);
+                            } else {
+                                multiline.add(curLine);
+                            }
                         }
-
                         message.write(curLine.getBytes(charsetName));
                         message.write(0x0A);
                         curLine = reader.readLine();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index b019324fb..8591a2e17 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -17,8 +17,11 @@
 package org.apache.tika.parser.mbox;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 
 import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
 import java.util.Map;
 
@@ -118,6 +121,16 @@ public class MboxParserTest extends TikaTest {
         assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
     }
 
+    @Test
+    public void testMultilineHeader2() throws Exception {
+        //TIKA-4530: the body's "Content-type: something" line must never surface as a Content-Type value
+        for (Metadata m : getRecursiveMetadata("multiline2.mbox")) {
+            for (String mime : m.getValues(Metadata.CONTENT_TYPE)) {
+                assertFalse("something".equals(mime));
+            }
+        }
+    }
+
     @Test
     public void testQuoted() throws Exception {
         ContentHandler handler = new BodyContentHandler();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
new file mode 100644
index 000000000..144d0f238
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Received: from xxx by xxx with xxx; date
+
+Test content
+    outside of header
+Content-type: something
+ or other
\ No newline at end of file

Reply via email to