This is an automated email from the ASF dual-hosted git repository. lfcnassif pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 02bf521ba11f22d5de636c7de41fe8643497246a Author: Luis Nassif <[email protected]> AuthorDate: Thu Nov 26 18:14:18 2020 -0300 TIKA-3004: Fix parsing of emails attached to other emails in PST files --- CHANGES.txt | 2 + .../parser/microsoft/pst/OutlookPSTParser.java | 57 +++++++++++++--------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 6591f41..faf8913 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -16,6 +16,8 @@ Release 2.0.0 - ??? Release 1.26 - ??? * Great optimization in ForkParser (TIKA-3237). + + * Fix parsing of emails attached to other emails in PST files (TIKA-3004). Release 1.25 - 11/25/2020 diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java index e90077d..360e4a2 100644 --- a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java +++ b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java @@ -25,12 +25,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; -import com.pff.PSTAttachment; -import com.pff.PSTException; -import com.pff.PSTFile; -import com.pff.PSTFolder; -import com.pff.PSTMessage; -import com.pff.PSTRecipient; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -48,6 +42,13 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import com.pff.PSTAttachment; +import com.pff.PSTException; +import com.pff.PSTFile; +import com.pff.PSTFolder; +import com.pff.PSTMessage; +import com.pff.PSTRecipient; + /** * Parser for MS Outlook PST email storage files */ @@ -115,23 +116,7 @@ public class OutlookPSTParser extends AbstractParser { if (pstFolder.getContentCount() > 0) { PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); while (pstMail != null) { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); - handler.startElement("div", attributes); - handler.element("h1", pstMail.getSubject()); - - final Metadata mailMetadata = new Metadata(); - //parse attachments first so that stream exceptions - //in attachments can make it into mailMetadata. - //RecursiveParserWrapper copies the metadata and thereby prevents - //modifications to mailMetadata from making it into the - //metadata objects cached by the RecursiveParserWrapper - parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor); - parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor); - - handler.endElement("div"); - + parseMailAndAttachments(handler, pstMail, embeddedExtractor); pstMail = (PSTMessage) pstFolder.getNextChild(); } } @@ -146,6 +131,26 @@ public class OutlookPSTParser extends AbstractParser { } } + private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail, + EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); + handler.startElement("div", attributes); + handler.element("h1", pstMail.getSubject()); + + final Metadata mailMetadata = new Metadata(); + // parse attachments first so that stream exceptions + // in attachments can make it into mailMetadata. + // RecursiveParserWrapper copies the metadata and thereby prevents + // modifications to mailMetadata from making it into the + // metadata objects cached by the RecursiveParserWrapper + parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor); + parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor); + + handler.endElement("div"); + } + private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException { mailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); @@ -231,6 +236,12 @@ public class OutlookPSTParser extends AbstractParser { try { PSTAttachment attach = email.getAttachment(i); + PSTMessage attachedEmail = attach.getEmbeddedPSTMessage(); + if (attachedEmail != null) { + parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor); + continue; + } + // Get the filename; both long and short filenames can be used for attachments String filename = attach.getLongFilename(); if (filename.isEmpty()) {
