This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4490 in repository https://gitbox.apache.org/repos/asf/tika.git
commit b545fd387fc3a49443b5daf811b7b6a7f576a8b4 Author: tallison <[email protected]> AuthorDate: Thu Oct 2 16:45:42 2025 -0400 TIKA-4490 -- move the check to the parser level --- .../org/apache/tika/parser/mail/MailContentHandler.java | 7 ------- .../main/java/org/apache/tika/parser/mail/RFC822Parser.java | 13 +++++++++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 69ec3f598..9af23d004 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -128,10 +128,6 @@ class MailContentHandler implements ContentHandler { if (!extractAllAlternatives && alternativePartBuffer.size() > 0) { UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); IOUtils.copy(is, bos); - byte[] bytes = bos.toByteArray(); - if (bytes.length == 0) { - return; - } alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray())); } else if (!extractAllAlternatives && parts.size() < 2) { //if you're at the first level of embedding @@ -141,9 +137,6 @@ class MailContentHandler implements ContentHandler { UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); IOUtils.copy(is, bos); final byte[] bytes = bos.toByteArray(); - if (bytes.length == 0) { - return; - } if (detectInlineTextOrHtml(submd, bytes)) { handleInlineBodyPart(new BodyContents(submd, bytes)); } else { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java index bd964db4a..b2b552122 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java @@ -31,6 +31,7 @@ import org.xml.sax.SAXException; import org.apache.tika.config.Field; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -95,6 +96,7 @@ public class RFC822Parser implements Parser { parser.setNoRecurse(); xhtml.startDocument(); TikaInputStream tstream = TikaInputStream.get(stream); + checkForZeroByte(tstream);//avoid stackoverflow try { parser.parse(tstream); } catch (IOException e) { @@ -114,6 +116,17 @@ public class RFC822Parser implements Parser { xhtml.endDocument(); } + private void checkForZeroByte(TikaInputStream tstream) throws IOException, ZeroByteFileException { + tstream.mark(1); + try { + if (tstream.read() < 0) { + throw new ZeroByteFileException("rfc822 parser found zero bytes"); + } + } finally { + tstream.reset(); + } + } + /** * Until version 1.17, Tika handled all body parts as embedded objects (see TIKA-2478). * In 1.17, we modified the parser to select only the best alternative body
