This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 3f0479aee TIKA-4530 -- don't let body content slip into headers in mbox (#2376)
3f0479aee is described below
commit 3f0479aeed4178d16aec71799b7ef739a11958fc
Author: Tim Allison <[email protected]>
AuthorDate: Mon Oct 27 13:38:05 2025 -0400
TIKA-4530 -- don't let body content slip into headers in mbox (#2376)
* TIKA-4530 -- don't let body content slip into headers
(cherry picked from commit b7e9ed56213ba0d56d608d909935998979128732)
---
.../org/apache/tika/parser/mbox/MboxParser.java | 21 ++++++++++++++-------
.../org/apache/tika/parser/mbox/MboxParserTest.java | 11 +++++++++++
.../test/resources/test-documents/multiline2.mbox | 7 +++++++
3 files changed, 32 insertions(+), 7 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 4c7bea74c..23548bcae 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -48,6 +48,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mailcommons.MailUtil;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Mbox (mailbox) parser. This version extracts each mail from Mbox and uses
the
@@ -97,6 +98,7 @@ public class MboxParser implements Parser {
try (BufferedReader reader = new BufferedReader(isr)) {
String curLine = reader.readLine();
int mailItem = 0;
+ boolean inHeader = true;
do {
if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
Metadata mailMetadata = new Metadata();
@@ -110,16 +112,21 @@ public class MboxParser implements Parser {
if (curLine == null) {
break;
}
+
UnsynchronizedByteArrayOutputStream message =
UnsynchronizedByteArrayOutputStream.builder().setBufferSize(100000).get();
do {
- if (curLine.startsWith(" ") ||
curLine.startsWith("\t")) {
- String latestLine = multiline.poll();
- latestLine += " " + curLine.trim();
- multiline.add(latestLine);
- } else {
- multiline.add(curLine);
+ if (inHeader && StringUtils.isBlank(curLine)) {
+ inHeader = false;
+ }
+ if (inHeader) {
+ if (curLine.startsWith(" ") ||
curLine.startsWith("\t")) {
+ String latestLine = multiline.poll();
+ latestLine += " " + curLine.trim();
+ multiline.add(latestLine);
+ } else {
+ multiline.add(curLine);
+ }
}
-
message.write(curLine.getBytes(charsetName));
message.write(0x0A);
curLine = reader.readLine();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index b019324fb..3a5ca8c2b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.mbox;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import java.io.InputStream;
import java.util.List;
@@ -118,6 +119,16 @@ public class MboxParserTest extends TikaTest {
assertEquals("from xxx by xxx with xxx; date",
mailMetadata.get("MboxParser-received"));
}
+ @Test
+ public void testMultilineHeader2() throws Exception {
+ //make sure that we aren't injecting body content into headers
+ for (Metadata m : getRecursiveMetadata("multiline2.mbox")) {
+ for (String mime : m.getValues(Metadata.CONTENT_TYPE)) {
+ assertFalse("something".equals(mime));
+ }
+ }
+ }
+
@Test
public void testQuoted() throws Exception {
ContentHandler handler = new BodyContentHandler();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
new file mode 100644
index 000000000..144d0f238
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/multiline2.mbox
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Received: from xxx by xxx with xxx; date
+
+Test content
+ outside of header
+Content-type: something
+ or other
\ No newline at end of file