This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4530 in repository https://gitbox.apache.org/repos/asf/tika.git
commit af47456dc5a4940d182c0cad2d1b4a20cb4c422a Author: tallison <[email protected]> AuthorDate: Mon Oct 27 12:22:58 2025 -0400 TIKA-4530 -- don't let body content slip into headers --- .../org/apache/tika/parser/mbox/MboxParser.java | 24 +++++++++++++++------- .../apache/tika/parser/mbox/MboxParserTest.java | 13 ++++++++++++ .../test/resources/test-documents/multiline2.mbox | 7 +++++++ 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java index dddd9bd92..d73626eef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java @@ -49,6 +49,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.mailcommons.MailUtil; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; /** * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the @@ -98,7 +99,11 @@ public class MboxParser implements Parser { try (BufferedReader reader = new BufferedReader(isr)) { String curLine = reader.readLine(); int mailItem = 0; + boolean inHeader = true; do { + if (curLine.contains("1495533845574511907")) { + System.out.println("here"); + } if (curLine.startsWith(MBOX_RECORD_DIVIDER)) { Metadata mailMetadata = new Metadata(); Queue<String> multiline = new LinkedList<>(); @@ -111,16 +116,21 @@ public class MboxParser implements Parser { if (curLine == null) { break; } + UnsynchronizedByteArrayOutputStream message = UnsynchronizedByteArrayOutputStream.builder().setBufferSize(100000).get(); do { - if (curLine.startsWith(" ") || curLine.startsWith("\t")) { - String latestLine = multiline.poll(); - latestLine += " " + curLine.trim(); - multiline.add(latestLine); - } else { - multiline.add(curLine); + if (inHeader && StringUtils.isBlank(curLine)) { + inHeader = false; + } + if (inHeader) { + if (curLine.startsWith(" ") || curLine.startsWith("\t")) { + String latestLine = multiline.poll(); + latestLine += " " + curLine.trim(); + multiline.add(latestLine); + } else { + multiline.add(curLine); + } } - message.write(curLine.getBytes(charsetName)); message.write(0x0A); curLine = reader.readLine(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java index b019324fb..8591a2e17 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java @@ -17,8 +17,11 @@ package org.apache.tika.parser.mbox; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import java.io.InputStream; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import java.util.Map; @@ -118,6 +121,16 @@ public class MboxParserTest extends TikaTest { assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received")); } + @Test + public void testMultilineHeader2() throws Exception { + //make sure that we aren't injecting body content into headers + for (Metadata m : getRecursiveMetadata("multiline2.mbox")) { + for (String mime : m.getValues(Metadata.CONTENT_TYPE)) { + assertFalse("something".equals(mime)); + } + } + } + @Test public void testQuoted() throws Exception { ContentHandler handler = new BodyContentHandler(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox new file mode 100644 index 000000000..144d0f238 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox @@ -0,0 +1,7 @@ +From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009 +Received: from xxx by xxx with xxx; date + +Test content + outside of header +Content-type: something + or other \ No newline at end of file
