TIKA-1970: add special handling for Mac Mail's date format

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/912798a3
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/912798a3
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/912798a3

Branch: refs/heads/2.x
Commit: 912798a3f7142983bc44a62c82e19687aa9009a0
Parents: 464ad91
Author: tballison <[email protected]>
Authored: Tue May 17 16:09:16 2016 -0400
Committer: tballison <[email protected]>
Committed: Tue May 17 16:09:16 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/mail/MailContentHandler.java    | 58 +++++++++++++++++++-
 .../tika/parser/mail/RFC822ParserTest.java      | 11 ++++
 2 files changed, 67 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/912798a3/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index cc7eeaf..8b00004 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -18,6 +18,15 @@ package org.apache.tika.parser.mail;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.james.mime4j.MimeException;
 import org.apache.james.mime4j.codec.DecodeMonitor;
@@ -46,6 +55,8 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
+import static org.apache.tika.utils.DateUtils.UTC;
+
 /**
  * Bridge between mime4j's content handler and the generic Sax content handler
  * used by Tika. See
@@ -53,6 +64,25 @@ import org.xml.sax.SAXException;
  */
 class MailContentHandler implements ContentHandler {
 
+    //TIKA-1970 Mac Mail's format
+    private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
+            Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
+
+    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
+            //16 May 2016 at 09:30:32  GMT+1
+            createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+    };
+
+    private static DateFormat createDateFormat(String format, TimeZone timezone) {
+        SimpleDateFormat sdf =
+                new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
+        if (timezone != null) {
+            sdf.setTimeZone(timezone);
+        }
+        return sdf;
+    }
+
+
     private boolean strictParsing = false;
 
     private XHTMLContentHandler handler;
@@ -152,7 +182,8 @@ class MailContentHandler implements ContentHandler {
     /**
      * Header for the whole message or its parts
      *
-     * @see http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/
+     * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/">
+     *     http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/</a>
      * Field.html
      */
     public void field(Field field) throws MimeException {
@@ -197,7 +228,11 @@ class MailContentHandler implements ContentHandler {
                 processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
             } else if (fieldname.equalsIgnoreCase("Date")) {
                 DateTimeField dateField = (DateTimeField) parsedField;
-                metadata.set(TikaCoreProperties.CREATED, dateField.getDate());
+                Date date = dateField.getDate();
+                if (date == null) {
+                    date = tryOtherDateFormats(field.getBody());
+                }
+                metadata.set(TikaCoreProperties.CREATED, date);
             }
         } catch (RuntimeException me) {
             if (strictParsing) {
@@ -206,6 +241,25 @@ class MailContentHandler implements ContentHandler {
         }
     }
 
+    private static synchronized Date tryOtherDateFormats(String text) {
+        if (text == null) {
+            return null;
+        }
+        text = text.replaceAll("\\s+", " ").trim();
+        Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
+        if (matcher.find()) {
+            text = matcher.replaceFirst("GMT$1$2:00");
+        }
+
+        for (DateFormat format : ALTERNATE_DATE_FORMATS) {
+            try {
+                return format.parse(text);
+            } catch (ParseException e) {
+            }
+        }
+        return null;
+    }
+
     private void processAddressList(ParsedField field, String addressListType,
                                     String metadataField) throws MimeException {
         AddressListField toField = (AddressListField) field;

http://git-wip-us.apache.org/repos/asf/tika/blob/912798a3/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index c8c6624..e598f59 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -384,4 +384,15 @@ public class RFC822ParserTest extends TikaTest {
         r = getXML("testRFC822_eml");
         assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testDates() throws Exception {
+
+        //tests non-standard dates that mime4j can't parse
+        XMLResult r = getXML("testRFC822_date_utf8");
+        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+        r = getXML("testRFC822_eml");
+        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
+    }
 }

Reply via email to