TIKA-1970 - Mac Mail date of interesting format not parsed by james mime4j
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/09cc658a Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/09cc658a Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/09cc658a Branch: refs/heads/master Commit: 09cc658a30ae194323f1e444be73d396544efc6b Parents: e08d006 Author: tballison <[email protected]> Authored: Tue May 17 10:54:02 2016 -0400 Committer: tballison <[email protected]> Committed: Tue May 17 10:54:02 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/mail/MailContentHandler.java | 62 +++++++++++++++++++- .../tika/parser/mail/RFC822ParserTest.java | 15 +++++ 2 files changed, 76 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/09cc658a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 5369c1d..5a6984c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -18,6 +18,15 @@ package org.apache.tika.parser.mail; import java.io.IOException; import java.io.InputStream; +import java.text.DateFormat; +import java.text.DateFormatSymbols; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; +import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.james.mime4j.MimeException; import org.apache.james.mime4j.codec.DecodeMonitor; @@ -35,9 +44,12 @@ import org.apache.james.mime4j.field.LenientFieldParser; import org.apache.james.mime4j.parser.ContentHandler; import org.apache.james.mime4j.stream.BodyDescriptor; import org.apache.james.mime4j.stream.Field; +import org.apache.james.mime4j.util.ByteSequence; import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; @@ -46,6 +58,9 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; +import static org.apache.tika.utils.DateUtils.MIDDAY; +import static org.apache.tika.utils.DateUtils.UTC; + /** * Bridge between mime4j's content handler and the generic Sax content handler * used by Tika. See @@ -53,6 +68,27 @@ import org.xml.sax.SAXException; */ class MailContentHandler implements ContentHandler { + //TIKA-1970 Mac Mail's format + private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN = + Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z"); + + private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] { + //16 May 2016 at 09:30:32 GMT+1 + createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC), // UTC/Zulu + }; + + private static DateFormat createDateFormat(String format, TimeZone timezone) { + SimpleDateFormat sdf = + new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); + if (timezone != null) { + sdf.setTimeZone(timezone); + } + return sdf; + } + + + + private boolean strictParsing = false; private XHTMLContentHandler handler; @@ -197,15 +233,39 @@ class MailContentHandler implements ContentHandler { processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC); } else if (fieldname.equalsIgnoreCase("Date")) { DateTimeField dateField = (DateTimeField) parsedField; - metadata.set(TikaCoreProperties.CREATED, dateField.getDate()); + Date date = dateField.getDate(); + if (date == null) { + date = tryOtherDateFormats(field.getBody()); + } + metadata.set(TikaCoreProperties.CREATED, date); } } catch (RuntimeException me) { + me.printStackTrace(); if (strictParsing) { throw me; } } } + private static synchronized Date tryOtherDateFormats(String text) { + if (text == null) { + return null; + } + text = text.replaceAll("\\s+", " ").trim(); + Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text); + if (matcher.find()) { + text = matcher.replaceFirst("GMT$1$2:00"); + } + + for (DateFormat format : ALTERNATE_DATE_FORMATS) { + try { + return format.parse(text); + } catch (ParseException e) { + } + } + return null; + } + private void processAddressList(ParsedField field, String addressListType, String metadataField) throws MimeException { AddressListField toField = (AddressListField) field; http://git-wip-us.apache.org/repos/asf/tika/blob/09cc658a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index 0e8f613..6a69ea5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -32,6 +32,9 @@ import static org.mockito.Mockito.verify; import java.io.ByteArrayInputStream; import java.io.InputStream; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.TimeZone; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.tika.TikaTest; @@ -392,4 +395,16 @@ public class RFC822ParserTest extends TikaTest { r = getXML("testRFC822_eml"); assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE)); } + + @Test + public void testDates() throws Exception { + + //tests non-standard dates that mime4j can't parse + XMLResult r = getXML("testRFC822_date_utf8"); + assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); + + r = getXML("testRFC822_eml"); + assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); + } + }
