TIKA-1970: add special handling for Mac Mail's date format
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/912798a3 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/912798a3 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/912798a3 Branch: refs/heads/2.x Commit: 912798a3f7142983bc44a62c82e19687aa9009a0 Parents: 464ad91 Author: tballison <[email protected]> Authored: Tue May 17 16:09:16 2016 -0400 Committer: tballison <[email protected]> Committed: Tue May 17 16:09:16 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/mail/MailContentHandler.java | 58 +++++++++++++++++++- .../tika/parser/mail/RFC822ParserTest.java | 11 ++++ 2 files changed, 67 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/912798a3/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index cc7eeaf..8b00004 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -18,6 +18,15 @@ package org.apache.tika.parser.mail; import java.io.IOException; import java.io.InputStream; +import java.text.DateFormat; +import java.text.DateFormatSymbols; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; +import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.james.mime4j.MimeException; import org.apache.james.mime4j.codec.DecodeMonitor; @@ -46,6 +55,8 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; +import static org.apache.tika.utils.DateUtils.UTC; + /** * Bridge between mime4j's content handler and the generic Sax content handler * used by Tika. See @@ -53,6 +64,25 @@ import org.xml.sax.SAXException; */ class MailContentHandler implements ContentHandler { + //TIKA-1970 Mac Mail's format + private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN = + Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z"); + + private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] { + //16 May 2016 at 09:30:32 GMT+1 + createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC), // UTC/Zulu + }; + + private static DateFormat createDateFormat(String format, TimeZone timezone) { + SimpleDateFormat sdf = + new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); + if (timezone != null) { + sdf.setTimeZone(timezone); + } + return sdf; + } + + private boolean strictParsing = false; private XHTMLContentHandler handler; @@ -152,7 +182,8 @@ class MailContentHandler implements ContentHandler { /** * Header for the whole message or its parts * - * @see http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ + * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/"> + * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a> * Field.html */ public void field(Field field) throws MimeException { @@ -197,7 +228,11 @@ class MailContentHandler implements ContentHandler { processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC); } else if (fieldname.equalsIgnoreCase("Date")) { DateTimeField dateField = (DateTimeField) parsedField; - metadata.set(TikaCoreProperties.CREATED, dateField.getDate()); + Date date = dateField.getDate(); + if (date == null) { + date = tryOtherDateFormats(field.getBody()); + } + metadata.set(TikaCoreProperties.CREATED, date); } } catch (RuntimeException me) { if (strictParsing) { @@ -206,6 +241,25 @@ class MailContentHandler implements ContentHandler { } } + private static synchronized Date tryOtherDateFormats(String text) { + if (text == null) { + return null; + } + text = text.replaceAll("\\s+", " ").trim(); + Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text); + if (matcher.find()) { + text = matcher.replaceFirst("GMT$1$2:00"); + } + + for (DateFormat format : ALTERNATE_DATE_FORMATS) { + try { + return format.parse(text); + } catch (ParseException e) { + } + } + return null; + } + private void processAddressList(ParsedField field, String addressListType, String metadataField) throws MimeException { AddressListField toField = (AddressListField) field; http://git-wip-us.apache.org/repos/asf/tika/blob/912798a3/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index c8c6624..e598f59 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -384,4 +384,15 @@ public class RFC822ParserTest extends TikaTest { r = getXML("testRFC822_eml"); assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE)); } + + @Test + public void testDates() throws Exception { + + //tests non-standard dates that mime4j can't parse + XMLResult r = getXML("testRFC822_date_utf8"); + assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); + + r = getXML("testRFC822_eml"); + assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); + } }
