TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/31374a39 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/31374a39 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/31374a39 Branch: refs/heads/2.x Commit: 31374a39bae03bfc260f73662c133467637193f1 Parents: d6ce10b Author: Nick Burch <[email protected]> Authored: Wed Jul 20 18:15:25 2016 +0100 Committer: Nick Burch <[email protected]> Committed: Tue Jul 26 12:05:47 2016 +0100 ---------------------------------------------------------------------- CHANGES.txt | 3 + .../apache/tika/parser/mbox/MboxParserTest.java | 1 - .../tika/parser/mail/MailContentHandler.java | 11 ++-- .../tika/parser/mail/RFC822ParserTest.java | 68 +++++++++++++++++++- 4 files changed, 77 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index da9353d..a4fca50 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -22,6 +22,9 @@ Release 1.14 - ??? * Maintain more significant digits in cells of "General" format in XLS and XLSX (TIKA-2025). + * Avoid mark/reset issues when extracting or detecting embedded resources + in RFC822 emails (TIKA-2037). + * Improve extraction of embedded documents for PPT, PPTX and XLSX (TIKA-2026). http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java index 6ef803d..94c4e70 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java @@ -152,5 +152,4 @@ public class MboxParserTest { assertContains("When a Mapper completes", handler.toString()); } - } http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 2c8942e..9c16c8c 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -16,6 +16,9 @@ */ package org.apache.tika.parser.mail; +import static org.apache.tika.utils.DateUtils.MIDDAY; +import static org.apache.tika.utils.DateUtils.UTC; + import java.io.IOException; import java.io.InputStream; import java.text.DateFormat; @@ -55,9 +58,6 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; -import static org.apache.tika.utils.DateUtils.MIDDAY; -import static org.apache.tika.utils.DateUtils.UTC; - /** * Bridge between mime4j's content handler and the generic Sax content handler * used by Tika. See @@ -176,7 +176,10 @@ class MailContentHandler implements ContentHandler { try { if (extractor.shouldParseEmbedded(submd)) { - extractor.parseEmbedded(is, handler, submd, false); + // Wrap the InputStream before passing on, as the James provided + // one misses many features we might want eg mark/reset + TikaInputStream tis = TikaInputStream.get(is); + extractor.parseEmbedded(tis, handler, submd, false); } } catch (SAXException e) { throw new MimeException(e); http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index ee9a98b..3be1edd 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -27,24 +27,30 @@ import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import java.io.ByteArrayInputStream; -import java.io.File; +import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.text.DateFormat; import java.text.DateFormatSymbols; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; +import java.util.List; import java.util.Locale; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.tika.TikaTest; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParserContainerExtractor; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; @@ -53,6 +59,7 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.junit.Test; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class RFC822ParserTest extends TikaTest { @@ -482,4 +489,63 @@ public class RFC822ParserTest extends TikaTest { p.parse(TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8)), new DefaultHandler(), m, new ParseContext()); assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE)); } + + @Test + public void testExtractAttachments() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + Parser p = new RFC822Parser(); + ParseContext context = new ParseContext(); + + try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) { + p.parse(stream, handler, metadata, context); + } + + // Check we go the metadata + assertEquals("Tika Test <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); + assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE)); + + // Try again with attachment detecting and fetching + final Detector detector = new DefaultDetector(); + final Parser extParser = new AutoDetectParser(); + final List<MediaType> seenTypes = new ArrayList<MediaType>(); + final List<String> seenText = new ArrayList<String>(); + EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() { + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(InputStream stream, ContentHandler handler, + Metadata metadata, boolean outputHtml) throws SAXException, + IOException { + seenTypes.add( detector.detect(stream, metadata) ); + + ContentHandler h = new BodyContentHandler(); + try { + extParser.parse(stream, h, metadata, new ParseContext()); + } catch (TikaException e) { + throw new RuntimeException(e); + } + seenText.add(h.toString()); + } + }; + context.set(EmbeddedDocumentExtractor.class, ext); + + try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) { + p.parse(stream, handler, metadata, context); + } + + // Check we go the metadata + assertEquals("Tika Test <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); + assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE)); + + // Check attachments + assertEquals(2, seenTypes.size()); + assertEquals(2, seenText.size()); + assertEquals("text/plain", seenTypes.get(0).toString()); + assertEquals("image/png", seenTypes.get(1).toString()); + assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0)); + } }
