[ 
https://issues.apache.org/jira/browse/TIKA-1788?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16216124#comment-16216124
 ] 

ASF GitHub Bot commented on TIKA-1788:
--------------------------------------

tballison closed pull request #211: [TIKA-1788] RFC822Parser: provide email 
attachment filenames when available
URL: https://github.com/apache/tika/pull/211
 
 
   

This is a PR merged from a forked repository. Because GitHub hides the
original diff of a foreign pull request (one from a fork) once it is merged,
the diff is reproduced below for the sake of provenance:

diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 961ccfba1..7476347d5 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -27,6 +27,8 @@
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
 import java.util.TimeZone;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -44,6 +46,7 @@
 import org.apache.james.mime4j.dom.field.ParsedField;
 import org.apache.james.mime4j.dom.field.UnstructuredField;
 import org.apache.james.mime4j.field.LenientFieldParser;
+import org.apache.james.mime4j.message.MaximalBodyDescriptor;
 import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.stream.BodyDescriptor;
 import org.apache.james.mime4j.stream.Field;
@@ -151,6 +154,26 @@ public void body(BodyDescriptor body, InputStream is) 
throws MimeException,
         submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
         submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
 
+        if (body instanceof MaximalBodyDescriptor) {
+            MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
+            String contentDispositionType = 
maximalBody.getContentDispositionType();
+            if (contentDispositionType != null && 
!contentDispositionType.isEmpty()) {
+                StringBuilder contentDisposition = new StringBuilder( 
contentDispositionType );
+                Map<String, String> contentDispositionParameters = 
maximalBody.getContentDispositionParameters();
+                for ( Entry<String, String> param : 
contentDispositionParameters.entrySet() ) {
+                    contentDisposition.append("; ")
+                                      
.append(param.getKey()).append("=\"").append(param.getValue()).append('"');
+                }
+
+                String contentDispositionFileName = 
maximalBody.getContentDispositionFilename();
+                if ( contentDispositionFileName != null ) {
+                    submd.set( Metadata.RESOURCE_NAME_KEY, 
contentDispositionFileName );
+                }
+
+                submd.set( Metadata.CONTENT_DISPOSITION, 
contentDisposition.toString() );
+            }
+        }
+
         try {
             if (extractor.shouldParseEmbedded(submd)) {
                 // Wrap the InputStream before passing on, as the James 
provided
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index ff546f48a..03bce971d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -22,6 +22,7 @@
 import java.util.Set;
 
 import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.message.DefaultBodyDescriptorBuilder;
 import org.apache.james.mime4j.parser.MimeStreamParser;
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.exception.TikaException;
@@ -67,7 +68,7 @@ public void parse(InputStream stream, ContentHandler handler,
 
         config = context.get(MimeConfig.class, config);
 
-        MimeStreamParser parser = new MimeStreamParser(config);
+        MimeStreamParser parser = new MimeStreamParser(config, null, new 
DefaultBodyDescriptorBuilder());
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
 
         MailContentHandler mch = new MailContentHandler(
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java 
b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 025273c77..09cad0cf0 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -425,7 +425,8 @@ public void testGetAttachmentsAsEmbeddedResources() throws 
Exception {
         // No filenames available
         assertEquals(null, tracker.filenames.get(0));
         assertEquals(null, tracker.filenames.get(1));
-        assertEquals(null, tracker.filenames.get(2));
+        // Except for this using Content-Disposition filename field
+        assertEquals("logo.gif", tracker.filenames.get(2));
         // Types are available
         assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
         assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
@@ -560,6 +561,8 @@ public void testExtractAttachments() throws Exception {
         final Parser extParser = new AutoDetectParser();
         final List<MediaType> seenTypes = new ArrayList<MediaType>();
         final List<String> seenText = new ArrayList<String>();
+        final List<String> seenNames = new ArrayList<String>();
+        final List<String> seenContentDisposition = new ArrayList<String>();
         EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() {
             @Override
             public boolean shouldParseEmbedded(Metadata metadata) {
@@ -570,7 +573,9 @@ public boolean shouldParseEmbedded(Metadata metadata) {
             public void parseEmbedded(InputStream stream, ContentHandler 
handler,
                     Metadata metadata, boolean outputHtml) throws SAXException,
                     IOException {
+                seenNames.add( metadata.get(Metadata.RESOURCE_NAME_KEY) );
                 seenTypes.add( detector.detect(stream, metadata) );
+                seenContentDisposition.add( 
metadata.get(Metadata.CONTENT_DISPOSITION) );
                 
                 ContentHandler h = new BodyContentHandler();
                 try {
@@ -596,6 +601,52 @@ public void parseEmbedded(InputStream stream, 
ContentHandler handler,
         assertEquals(2, seenText.size());
         assertEquals("text/plain", seenTypes.get(0).toString());
         assertEquals("image/png", seenTypes.get(1).toString());
+        assertEquals("testPNG.png", seenNames.get(1));
         assertEquals("This email has a PNG attachment included in it\n\n", 
seenText.get(0));
+        assertEquals(null, seenContentDisposition.get(0));
+        assertEquals("attachment; filename=\"testPNG.png\"", 
seenContentDisposition.get(1));
+    }
+
+    @Test
+    public void testEmbeddedMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        Parser p = new RFC822Parser();
+        ParseContext context = new ParseContext();
+        final Parser extParser = new AutoDetectParser();
+        final List<Metadata> seenMetadata = new ArrayList<>();
+        EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() {
+            @Override
+            public boolean shouldParseEmbedded(Metadata metadata) {
+                return true;
+            }
+
+            @Override
+            public void parseEmbedded(InputStream stream, ContentHandler 
handler,
+                                      Metadata metadata, boolean outputHtml) 
throws SAXException,
+                                                                               
     IOException {
+                seenMetadata.add( metadata );
+                try {
+                    extParser.parse(stream, new DefaultHandler(), metadata, 
new ParseContext());
+                } catch (TikaException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        };
+        context.set(EmbeddedDocumentExtractor.class, ext);
+
+        try(InputStream stream = getStream( 
"test-documents/testRFC822-multipart" )) {
+            p.parse(stream, new DefaultHandler(), metadata, context);
+        }
+
+        assertEquals(3, seenMetadata.size());
+        assertEquals(null, 
seenMetadata.get(0).get(Metadata.CONTENT_DISPOSITION));
+        assertEquals("text/plain; charset=UTF-8", 
seenMetadata.get(0).get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", 
seenMetadata.get(0).get(Metadata.CONTENT_ENCODING));
+        assertEquals(null, 
seenMetadata.get(1).get(Metadata.CONTENT_DISPOSITION));
+        assertEquals("text/html; charset=UTF-8", 
seenMetadata.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", 
seenMetadata.get(1).get(Metadata.CONTENT_ENCODING));
+        assertEquals("attachment; filename=\"logo.gif\"", 
seenMetadata.get(2).get(Metadata.CONTENT_DISPOSITION));
+        assertEquals("logo.gif", 
seenMetadata.get(2).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("image/gif", 
seenMetadata.get(2).get(Metadata.CONTENT_TYPE));
     }
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> message/rfc822 parser doesn't identify attachment filenames from 
> Content-Disposition header
> -------------------------------------------------------------------------------------------
>
>                 Key: TIKA-1788
>                 URL: https://issues.apache.org/jira/browse/TIKA-1788
>             Project: Tika
>          Issue Type: Bug
>    Affects Versions: 1.11
>            Reporter: Sergey Tsalkov
>            Assignee: Tim Allison
>         Attachments: grep_content_disposition.zip
>
>
> rfc822 email files can contain attachments as subparts, and they'll
> generally specify the filename of the attachment in a manner like
> this:
> Content-Disposition: attachment;
>         filename*=utf-8''image001.jpg
> Tika doesn't seem to be grabbing that information at all!



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to