This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push: new 6829643 TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment. 6829643 is described below commit 68296437b23052ebc4415a9cec9aadc14141f634 Author: tballison <talli...@mitre.org> AuthorDate: Wed Jan 31 13:33:08 2018 -0500 TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment. --- CHANGES.txt | 8 ++++ .../tika/parser/mail/MailContentHandler.java | 52 ++++++++++++++++++---- .../apache/tika/parser/mail/RFC822ParserTest.java | 19 ++++++++ .../resources/test-documents/testRFC822-txt-body | 35 +++++++++++++++ 4 files changed, 106 insertions(+), 8 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 1399520..b1ca828 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,7 +1,15 @@ Release 1.18 - ??? + + * RFC822 with multipart/mixed, first text element should be treated + as the main body of the email, not an attachment (TIKA-2547). + * Swap out com.tdunning:json for com.github.openjson:openjson to avoid jar conflicts (TIKA-2556). + * No longer hardcode HtmlParser for XML files in tika-server (TIKA-2551). + + * Require Java 8 (TIKA-2553). + * Add a parser for XPS (TIKA-2524). * Mime magic for Dolby Digital AC3 and EAC3 files diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 40db8f3..ddc32b8 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -33,6 +33,7 @@ import org.apache.james.mime4j.message.MaximalBodyDescriptor; import org.apache.james.mime4j.parser.ContentHandler; import org.apache.james.mime4j.stream.BodyDescriptor; import org.apache.james.mime4j.stream.Field; +import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -147,6 +148,7 @@ class MailContentHandler implements ContentHandler { private boolean strictParsing = false; private final boolean extractAllAlternatives; private final EmbeddedDocumentExtractor extractor; + private final Detector detector; //this is used to buffer a multipart body that //keeps track of multipart/alternative and its children @@ -167,6 +169,7 @@ class MailContentHandler implements ContentHandler { // Was an EmbeddedDocumentExtractor explicitly supplied? this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + this.detector = new EmbeddedDocumentUtil(context).getDetector(); } @Override @@ -184,16 +187,16 @@ class MailContentHandler implements ContentHandler { if (parts.size() > 0) { submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType()); submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary()); - } + } if (body instanceof MaximalBodyDescriptor) { MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body; String contentDispositionType = maximalBody.getContentDispositionType(); if (contentDispositionType != null && !contentDispositionType.isEmpty()) { - StringBuilder contentDisposition = new StringBuilder( contentDispositionType ); + StringBuilder contentDisposition = new StringBuilder(contentDispositionType); Map<String, String> contentDispositionParameters = maximalBody.getContentDispositionParameters(); - for ( Entry<String, String> param : contentDispositionParameters.entrySet() ) { + for (Entry<String, String> param : contentDispositionParameters.entrySet()) { contentDisposition.append("; ") - .append(param.getKey()).append("=\"").append(param.getValue()).append('"'); + .append(param.getKey()).append("=\"").append(param.getValue()).append('"'); } String contentDispositionFileName = maximalBody.getContentDispositionFilename(); @@ -201,15 +204,31 @@ class MailContentHandler implements ContentHandler { submd.set( Metadata.RESOURCE_NAME_KEY, contentDispositionFileName ); } - submd.set( Metadata.CONTENT_DISPOSITION, contentDisposition.toString() ); + submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString()); } } //if we're in a multipart/alternative or any one of its children //add the bodypart to the latest that was added - if (! extractAllAlternatives && alternativePartBuffer.size() > 0) { + if (!extractAllAlternatives && alternativePartBuffer.size() > 0) { ByteArrayOutputStream bos = new ByteArrayOutputStream(); IOUtils.copy(is, bos); alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray())); + } else if (!extractAllAlternatives && parts.size() == 1) { + //if you're at the first level of embedding + //and you're not in an alternative part block + //and you're text/html, put that in the body of the email + //otherwise treat as a regular attachment + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + IOUtils.copy(is, bos); + byte[] bytes = bos.toByteArray(); + if (isTextOrHtml(submd, bytes)) { + handleInlineBodyPart(new BodyContents(submd, bos.toByteArray())); + } else { + //else handle as you would any other embedded content + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + handleEmbedded(tis, submd); + } + } } else { //else handle as you would any other embedded content try (TikaInputStream tis = TikaInputStream.get(is)) { @@ -218,6 +237,22 @@ class MailContentHandler implements ContentHandler { } } + private boolean isTextOrHtml(Metadata submd, byte[] bytes) { + String mediaTypeString = submd.get(Metadata.CONTENT_TYPE); + if (mediaTypeString != null && mediaTypeString.startsWith("text")) { + return true; + } + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + MediaType mediaType = detector.detect(tis, submd); + if (mediaType != null && mediaType.toString().startsWith("text")) { + return true; + } + } catch (IOException e) { + + } + return false; + } + private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException { String disposition = metadata.get(Metadata.CONTENT_DISPOSITION); @@ -516,7 +551,7 @@ class MailContentHandler implements ContentHandler { } if (part instanceof BodyContents) { - handlePart((BodyContents)part); + handleInlineBodyPart((BodyContents)part); return; } @@ -539,7 +574,7 @@ class MailContentHandler implements ContentHandler { } } - private void handlePart(BodyContents part) throws MimeException, IOException { + private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { String contentType = part.metadata.get(Metadata.CONTENT_TYPE); Parser parser = null; if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { @@ -555,6 +590,7 @@ class MailContentHandler implements ContentHandler { if (parser == null) { + //back off and treat it as an embedded chunk try (TikaInputStream tis = TikaInputStream.get(part.bytes)) { handleEmbedded(tis, part.metadata); } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index 7b48f13..0e8c237 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -260,6 +260,25 @@ public class RFC822ParserTest extends TikaTest { metadata.get(Metadata.SUBJECT)); } + @Test + public void testMainBody() throws Exception { + //test that the first text or html chunk is processed in the main body + //not treated as an attachment. TIKA-2547 + List<Metadata> metadataList = getRecursiveMetadata("testRFC822_oddfrom"); + assertEquals(7, metadataList.size()); + assertContains("Air Quality Planning", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + + //Make sure text alternative doesn't get treated as an attachment + metadataList = getRecursiveMetadata("testRFC822_normal_zip"); + assertEquals(3, metadataList.size()); + assertContains("This is the HTML part", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("application/zip", metadataList.get(2).get(Metadata.CONTENT_TYPE)); + + metadataList = getRecursiveMetadata("testRFC822-txt-body"); + assertEquals(2, metadataList.size()); + assertContains("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + } + /** * Test for TIKA-640, increase header max beyond 10k bytes */ diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body b/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body new file mode 100644 index 0000000..de28397 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body @@ -0,0 +1,35 @@ +MIME-Version: 1.0 +Received: by 10.103.33.199 with HTTP; Tue, 6 Jun 2017 14:48:27 -0700 (PDT) +Bcc: emailtosalesfo...@r-kub1lq8760pccrdt39x94qxtajhk3q4zb1fzikf15ygnugofn.6a-euhkuaa.na50.le.salesforce.com +Date: Tue, 6 Jun 2017 14:48:27 -0700 +Delivered-To: john....@gmail.com +Message-ID: <CACmaLAZ16kghp1Qf99noL2P33AnAzU7bbKfju=jasryyscv...@mail.gmail.com> +Subject: Test BCCing email (rev 2) +From: John Doe <john....@gmail.com> +To: john.sm...@domain.com +Content-Type: multipart/mixed; boundary="94eb2c03266668996305515194b6" + +This is a multipart message in MIME format. + +--94eb2c03266668996305515194b6 +Content-Type: text/plain; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +This is an email that will have some rich text and an attachment. + +*Because I've added some bold text here.* + +body 1 +*=E2=80=8B* +*And here's some more text (still bold)* + +-- John + +--94eb2c03266668996305515194b6 +Content-Type: image/jpeg; name="mary-coffee.jpg" +Content-Disposition: attachment; filename="mary-coffee.jpg" +Content-Transfer-Encoding: base64 +X-Attachment-Id: f_j3m3jfpq1 + + +--94eb2c03266668996305515194b6-- \ No newline at end of file -- To stop receiving notification emails like this one, please contact talli...@apache.org.