This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 6829643   TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment.
6829643 is described below

commit 68296437b23052ebc4415a9cec9aadc14141f634
Author: tballison <talli...@mitre.org>
AuthorDate: Wed Jan 31 13:33:08 2018 -0500

     TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment.
---
 CHANGES.txt                                        |  8 ++++
 .../tika/parser/mail/MailContentHandler.java       | 52 ++++++++++++++++++----
 .../apache/tika/parser/mail/RFC822ParserTest.java  | 19 ++++++++
 .../resources/test-documents/testRFC822-txt-body   | 35 +++++++++++++++
 4 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1399520..b1ca828 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,15 @@
 Release 1.18 - ???
+
+   * RFC822 with multipart/mixed, first text element should be treated
+     as the main body of the email, not an attachment (TIKA-2547).
+
    * Swap out com.tdunning:json for com.github.openjson:openjson to avoid
      jar conflicts (TIKA-2556).
 
+   * No longer hardcode HtmlParser for XML files in tika-server (TIKA-2551).
+
+   * Require Java 8 (TIKA-2553).
+
    * Add a parser for XPS (TIKA-2524).
 
    * Mime magic for Dolby Digital AC3 and EAC3 files
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 40db8f3..ddc32b8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -33,6 +33,7 @@ import org.apache.james.mime4j.message.MaximalBodyDescriptor;
 import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.stream.BodyDescriptor;
 import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -147,6 +148,7 @@ class MailContentHandler implements ContentHandler {
     private boolean strictParsing = false;
     private final boolean extractAllAlternatives;
     private final EmbeddedDocumentExtractor extractor;
+    private final Detector detector;
 
     //this is used to buffer a multipart body that
     //keeps track of multipart/alternative and its children
@@ -167,6 +169,7 @@ class MailContentHandler implements ContentHandler {
 
         // Was an EmbeddedDocumentExtractor explicitly supplied?
         this.extractor = 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        this.detector = new EmbeddedDocumentUtil(context).getDetector();
     }
 
     @Override
@@ -184,16 +187,16 @@ class MailContentHandler implements ContentHandler {
         if (parts.size() > 0) {
             submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
             submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
-        }   
+        }
         if (body instanceof MaximalBodyDescriptor) {
             MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
             String contentDispositionType = 
maximalBody.getContentDispositionType();
             if (contentDispositionType != null && 
!contentDispositionType.isEmpty()) {
-                StringBuilder contentDisposition = new StringBuilder( 
contentDispositionType );
+                StringBuilder contentDisposition = new 
StringBuilder(contentDispositionType);
                 Map<String, String> contentDispositionParameters = 
maximalBody.getContentDispositionParameters();
-                for ( Entry<String, String> param : 
contentDispositionParameters.entrySet() ) {
+                for (Entry<String, String> param : 
contentDispositionParameters.entrySet()) {
                     contentDisposition.append("; ")
-                                      
.append(param.getKey()).append("=\"").append(param.getValue()).append('"');
+                            
.append(param.getKey()).append("=\"").append(param.getValue()).append('"');
                 }
 
                 String contentDispositionFileName = 
maximalBody.getContentDispositionFilename();
@@ -201,15 +204,31 @@ class MailContentHandler implements ContentHandler {
                     submd.set( Metadata.RESOURCE_NAME_KEY, 
contentDispositionFileName );
                 }
 
-                submd.set( Metadata.CONTENT_DISPOSITION, 
contentDisposition.toString() );
+                submd.set(Metadata.CONTENT_DISPOSITION, 
contentDisposition.toString());
             }
         }
         //if we're in a multipart/alternative or any one of its children
         //add the bodypart to the latest that was added
-        if (! extractAllAlternatives && alternativePartBuffer.size() > 0) {
+        if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
             ByteArrayOutputStream bos = new ByteArrayOutputStream();
             IOUtils.copy(is, bos);
             alternativePartBuffer.peek().children.add(new BodyContents(submd, 
bos.toByteArray()));
+        } else if (!extractAllAlternatives && parts.size() == 1) {
+            //if you're at the first level of embedding
+            //and you're not in an alternative part block
+            //and you're text/html, put that in the body of the email
+            //otherwise treat as a regular attachment
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(is, bos);
+            byte[] bytes = bos.toByteArray();
+            if (isTextOrHtml(submd, bytes)) {
+                handleInlineBodyPart(new BodyContents(submd, 
bos.toByteArray()));
+            } else {
+                //else handle as you would any other embedded content
+                try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+                    handleEmbedded(tis, submd);
+                }
+            }
         } else {
             //else handle as you would any other embedded content
             try (TikaInputStream tis = TikaInputStream.get(is)) {
@@ -218,6 +237,22 @@ class MailContentHandler implements ContentHandler {
         }
     }
 
+    private boolean isTextOrHtml(Metadata submd, byte[] bytes) {
+        String mediaTypeString = submd.get(Metadata.CONTENT_TYPE);
+        if (mediaTypeString != null && mediaTypeString.startsWith("text")) {
+            return true;
+        }
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            MediaType mediaType = detector.detect(tis, submd);
+            if (mediaType != null && mediaType.toString().startsWith("text")) {
+                return true;
+            }
+        } catch (IOException e) {
+
+        }
+        return false;
+    }
+
     private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws 
MimeException, IOException {
 
         String disposition = metadata.get(Metadata.CONTENT_DISPOSITION);
@@ -516,7 +551,7 @@ class MailContentHandler implements ContentHandler {
         }
 
         if (part instanceof BodyContents) {
-            handlePart((BodyContents)part);
+            handleInlineBodyPart((BodyContents)part);
             return;
         }
 
@@ -539,7 +574,7 @@ class MailContentHandler implements ContentHandler {
         }
     }
 
-    private void handlePart(BodyContents part) throws MimeException, 
IOException {
+    private void handleInlineBodyPart(BodyContents part) throws MimeException, 
IOException {
         String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
         Parser parser = null;
         if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
@@ -555,6 +590,7 @@ class MailContentHandler implements ContentHandler {
 
 
         if (parser == null) {
+            //back off and treat it as an embedded chunk
             try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
                 handleEmbedded(tis, part.metadata);
             }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 7b48f13..0e8c237 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -260,6 +260,25 @@ public class RFC822ParserTest extends TikaTest {
                 metadata.get(Metadata.SUBJECT));
     }
 
+    @Test
+    public void testMainBody() throws Exception {
+        //test that the first text or html chunk is processed in the main body
+        //not treated as an attachment. TIKA-2547
+        List<Metadata> metadataList = 
getRecursiveMetadata("testRFC822_oddfrom");
+        assertEquals(7, metadataList.size());
+        assertContains("Air Quality Planning", 
metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+        //Make sure text alternative doesn't get treated as an attachment
+        metadataList = getRecursiveMetadata("testRFC822_normal_zip");
+        assertEquals(3, metadataList.size());
+        assertContains("This is the HTML part", 
metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("application/zip", 
metadataList.get(2).get(Metadata.CONTENT_TYPE));
+
+        metadataList = getRecursiveMetadata("testRFC822-txt-body");
+        assertEquals(2, metadataList.size());
+        assertContains("body 1", 
metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
+
     /**
      * Test for TIKA-640, increase header max beyond 10k bytes
      */
diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body b/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body
new file mode 100644
index 0000000..de28397
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body
@@ -0,0 +1,35 @@
+MIME-Version: 1.0
+Received: by 10.103.33.199 with HTTP; Tue, 6 Jun 2017 14:48:27 -0700 (PDT)
+Bcc: 
emailtosalesfo...@r-kub1lq8760pccrdt39x94qxtajhk3q4zb1fzikf15ygnugofn.6a-euhkuaa.na50.le.salesforce.com
+Date: Tue, 6 Jun 2017 14:48:27 -0700
+Delivered-To: john....@gmail.com
+Message-ID: 
<CACmaLAZ16kghp1Qf99noL2P33AnAzU7bbKfju=jasryyscv...@mail.gmail.com>
+Subject: Test BCCing email (rev 2)
+From: John Doe <john....@gmail.com>
+To: john.sm...@domain.com
+Content-Type: multipart/mixed; boundary="94eb2c03266668996305515194b6"
+
+This is a multipart message in MIME format.
+
+--94eb2c03266668996305515194b6
+Content-Type: text/plain; charset="UTF-8"
+Content-Transfer-Encoding: quoted-printable
+
+This is an email that will have some rich text and an attachment.
+
+*Because I've added some bold text here.*
+
+body 1
+*=E2=80=8B*
+*And here's some more text (still bold)*
+
+-- John
+
+--94eb2c03266668996305515194b6
+Content-Type: image/jpeg; name="mary-coffee.jpg"
+Content-Disposition: attachment; filename="mary-coffee.jpg"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: f_j3m3jfpq1
+
+
+--94eb2c03266668996305515194b6--
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
talli...@apache.org.

Reply via email to