This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new b7e9ed562 TIKA-4530 -- don't let body content slip into headers in mbox (#2376)
b7e9ed562 is described below

commit b7e9ed56213ba0d56d608d909935998979128732
Author: Tim Allison <[email protected]>
AuthorDate: Mon Oct 27 13:38:05 2025 -0400

    TIKA-4530 -- don't let body content slip into headers in mbox (#2376)
    
    * TIKA-4530 -- don't let body content slip into headers
---
 .../org/apache/tika/parser/mbox/MboxParser.java     | 21 ++++++++++++++-------
 .../org/apache/tika/parser/mbox/MboxParserTest.java | 11 +++++++++++
 .../test/resources/test-documents/multiline2.mbox   |  7 +++++++
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index dddd9bd92..f4834337b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -49,6 +49,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.mailcommons.MailUtil;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
@@ -98,6 +99,7 @@ public class MboxParser implements Parser {
         try (BufferedReader reader = new BufferedReader(isr)) {
             String curLine = reader.readLine();
             int mailItem = 0;
+            boolean inHeader = true;
             do {
                 if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
                     Metadata mailMetadata = new Metadata();
@@ -111,16 +113,21 @@ public class MboxParser implements Parser {
                     if (curLine == null) {
                         break;
                     }
+
                     UnsynchronizedByteArrayOutputStream message = UnsynchronizedByteArrayOutputStream.builder().setBufferSize(100000).get();
                     do {
-                        if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
-                            String latestLine = multiline.poll();
-                            latestLine += " " + curLine.trim();
-                            multiline.add(latestLine);
-                        } else {
-                            multiline.add(curLine);
+                        if (inHeader && StringUtils.isBlank(curLine)) {
+                            inHeader = false;
+                        }
+                        if (inHeader) {
+                            if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                                String latestLine = multiline.poll();
+                                latestLine += " " + curLine.trim();
+                                multiline.add(latestLine);
+                            } else {
+                                multiline.add(curLine);
+                            }
                         }
-
                         message.write(curLine.getBytes(charsetName));
                         message.write(0x0A);
                         curLine = reader.readLine();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index b019324fb..3a5ca8c2b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.mbox;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 
 import java.io.InputStream;
 import java.util.List;
@@ -118,6 +119,16 @@ public class MboxParserTest extends TikaTest {
         assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
     }
 
+    @Test
+    public void testMultilineHeader2() throws Exception {
+        //make sure that we aren't injecting body content into headers
+        for (Metadata m : getRecursiveMetadata("multiline2.mbox")) {
+            for (String mime : m.getValues(Metadata.CONTENT_TYPE)) {
+                assertFalse("something".equals(mime));
+            }
+        }
+    }
+
     @Test
     public void testQuoted() throws Exception {
         ContentHandler handler = new BodyContentHandler();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
new file mode 100644
index 000000000..144d0f238
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Received: from xxx by xxx with xxx; date
+
+Test content
+    outside of header
+Content-type: something
+ or other
\ No newline at end of file

Reply via email to