This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5b2b17de9 TIKA-4490 (#2350)
5b2b17de9 is described below
commit 5b2b17de9ec47602425d7dcc8d40d1605cfe5ba1
Author: Tim Allison <[email protected]>
AuthorDate: Thu Oct 2 18:10:31 2025 -0400
TIKA-4490 (#2350)
* TIKA-4490 -- move the check to the parser level
---
.../org/apache/tika/parser/mail/MailContentHandler.java | 7 -------
.../main/java/org/apache/tika/parser/mail/RFC822Parser.java | 13 +++++++++++++
.../src/test/java/org/apache/tika/ossfuzz/ParserFuzzer.java | 1 -
3 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 69ec3f598..9af23d004 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -128,10 +128,6 @@ class MailContentHandler implements ContentHandler {
if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get();
IOUtils.copy(is, bos);
- byte[] bytes = bos.toByteArray();
- if (bytes.length == 0) {
- return;
- }
alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
} else if (!extractAllAlternatives && parts.size() < 2) {
//if you're at the first level of embedding
@@ -141,9 +137,6 @@ class MailContentHandler implements ContentHandler {
UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get();
IOUtils.copy(is, bos);
final byte[] bytes = bos.toByteArray();
- if (bytes.length == 0) {
- return;
- }
if (detectInlineTextOrHtml(submd, bytes)) {
handleInlineBodyPart(new BodyContents(submd, bytes));
} else {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index bd964db4a..b2b552122 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -31,6 +31,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -95,6 +96,7 @@ public class RFC822Parser implements Parser {
parser.setNoRecurse();
xhtml.startDocument();
TikaInputStream tstream = TikaInputStream.get(stream);
+ checkForZeroByte(tstream);//avoid stackoverflow
try {
parser.parse(tstream);
} catch (IOException e) {
@@ -114,6 +116,17 @@ public class RFC822Parser implements Parser {
xhtml.endDocument();
}
+ private void checkForZeroByte(TikaInputStream tstream) throws IOException, ZeroByteFileException {
+ tstream.mark(1);
+ try {
+ if (tstream.read() < 0) {
+ throw new ZeroByteFileException("rfc822 parser found zero bytes");
+ }
+ } finally {
+ tstream.reset();
+ }
+ }
+
/**
* Until version 1.17, Tika handled all body parts as embedded objects (see TIKA-2478).
* In 1.17, we modified the parser to select only the best alternative body
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/ossfuzz/ParserFuzzer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/ossfuzz/ParserFuzzer.java
index d13a4748f..129b964d6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/ossfuzz/ParserFuzzer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/ossfuzz/ParserFuzzer.java
@@ -27,7 +27,6 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
-
import org.apache.tika.sax.ToTextContentHandler;