Repository: tika Updated Branches: refs/heads/master 3ecdc0cb0 -> 952fb54ed
TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/952fb54e Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/952fb54e Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/952fb54e Branch: refs/heads/master Commit: 952fb54ed78a2fba07db4653cc674f5641211031 Parents: 3ecdc0c Author: Nick Burch <[email protected]> Authored: Wed Jul 20 18:15:25 2016 +0100 Committer: Nick Burch <[email protected]> Committed: Wed Jul 20 18:15:25 2016 +0100 ---------------------------------------------------------------------- CHANGES.txt | 3 + .../tika/parser/mail/MailContentHandler.java | 13 ++-- .../tika/parser/mail/RFC822ParserTest.java | 68 +++++++++++++++++++- .../apache/tika/parser/mbox/MboxParserTest.java | 1 - 4 files changed, 77 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index f6191b4..6ba831f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 1.14 - ??? + * Avoid mark/reset issues when extracting or detecting embedded resources + in RFC822 emails (TIKA-2037). + * Improving accuracy of Tesseract for better extraction of numeric and alphanumeric text from images (TIKA-2021). 
http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 8d16961..6a9bc1b 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -16,6 +16,9 @@ */ package org.apache.tika.parser.mail; +import static org.apache.tika.utils.DateUtils.MIDDAY; +import static org.apache.tika.utils.DateUtils.UTC; + import java.io.IOException; import java.io.InputStream; import java.text.DateFormat; @@ -44,9 +47,7 @@ import org.apache.james.mime4j.field.LenientFieldParser; import org.apache.james.mime4j.parser.ContentHandler; import org.apache.james.mime4j.stream.BodyDescriptor; import org.apache.james.mime4j.stream.Field; -import org.apache.james.mime4j.util.ByteSequence; import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.TikaInputStream; @@ -58,9 +59,6 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; -import static org.apache.tika.utils.DateUtils.MIDDAY; -import static org.apache.tika.utils.DateUtils.UTC; - /** * Bridge between mime4j's content handler and the generic Sax content handler * used by Tika. 
See @@ -179,7 +177,10 @@ class MailContentHandler implements ContentHandler { try { if (extractor.shouldParseEmbedded(submd)) { - extractor.parseEmbedded(is, handler, submd, false); + // Wrap the InputStream before passing on, as the James provided + // one misses many features we might want eg mark/reset + TikaInputStream tis = TikaInputStream.get(is); + extractor.parseEmbedded(tis, handler, submd, false); } } catch (SAXException e) { throw new MimeException(e); http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index 0d3a2c5..c7fcbfb 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -31,24 +31,30 @@ import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.text.DateFormat; import java.text.DateFormatSymbols; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; +import java.util.List; import java.util.Locale; -import java.util.TimeZone; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.tika.TikaTest; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParserContainerExtractor; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import 
org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; @@ -58,6 +64,7 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.junit.Test; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class RFC822ParserTest extends TikaTest { @@ -496,4 +503,63 @@ public class RFC822ParserTest extends TikaTest { assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE)); } + + @Test + public void testExtractAttachments() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + Parser p = new RFC822Parser(); + ParseContext context = new ParseContext(); + + try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) { + p.parse(stream, handler, metadata, context); + } + + // Check we got the metadata + assertEquals("Tika Test <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); + assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE)); + + // Try again with attachment detecting and fetching + final Detector detector = new DefaultDetector(); + final Parser extParser = new AutoDetectParser(); + final List<MediaType> seenTypes = new ArrayList<MediaType>(); + final List<String> seenText = new ArrayList<String>(); + EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() { + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(InputStream stream, ContentHandler handler, + Metadata metadata, boolean outputHtml) throws SAXException, + IOException { + seenTypes.add( detector.detect(stream, metadata) ); + + ContentHandler h = new BodyContentHandler(); + try { + 
extParser.parse(stream, h, metadata, new ParseContext()); + } catch (TikaException e) { + throw new RuntimeException(e); + } + seenText.add(h.toString()); + } + }; + context.set(EmbeddedDocumentExtractor.class, ext); + + try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) { + p.parse(stream, handler, metadata, context); + } + + // Check we got the metadata + assertEquals("Tika Test <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); + assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE)); + + // Check attachments + assertEquals(2, seenTypes.size()); + assertEquals(2, seenText.size()); + assertEquals("text/plain", seenTypes.get(0).toString()); + assertEquals("image/png", seenTypes.get(1).toString()); + assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0)); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java index 6ef803d..94c4e70 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java @@ -152,5 +152,4 @@ public class MboxParserTest { assertContains("When a Mapper completes", handler.toString()); } - }
