TIKA-1970 - Mac Mail date of interesting format not parsed by james mime4j

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/09cc658a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/09cc658a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/09cc658a

Branch: refs/heads/master
Commit: 09cc658a30ae194323f1e444be73d396544efc6b
Parents: e08d006
Author: tballison <[email protected]>
Authored: Tue May 17 10:54:02 2016 -0400
Committer: tballison <[email protected]>
Committed: Tue May 17 10:54:02 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/mail/MailContentHandler.java    | 62 +++++++++++++++++++-
 .../tika/parser/mail/RFC822ParserTest.java      | 15 +++++
 2 files changed, 76 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/09cc658a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 5369c1d..5a6984c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -18,6 +18,15 @@ package org.apache.tika.parser.mail;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.james.mime4j.MimeException;
 import org.apache.james.mime4j.codec.DecodeMonitor;
@@ -35,9 +44,12 @@ import org.apache.james.mime4j.field.LenientFieldParser;
 import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.stream.BodyDescriptor;
 import org.apache.james.mime4j.stream.Field;
+import org.apache.james.mime4j.util.ByteSequence;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -46,6 +58,9 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
 /**
  * Bridge between mime4j's content handler and the generic Sax content handler
  * used by Tika. See
@@ -53,6 +68,27 @@ import org.xml.sax.SAXException;
  */
 class MailContentHandler implements ContentHandler {
 
+    //TIKA-1970 Mac Mail's format: trailing "UTC"/"GMT" offset given in hours only (sign plus one or two digits)
+    private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
+            Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
+
+    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
+            //example header value: 16 May 2016 at 09:30:32  GMT+1
+            createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC),   // formatter's own time zone is set to UTC
+    };
+
+    private static DateFormat createDateFormat(String format, TimeZone timezone) { // builds a SimpleDateFormat for the given pattern
+        SimpleDateFormat sdf =
+                new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); // US symbols: month names are read as English
+        if (timezone != null) { // a null timezone leaves the formatter's time zone untouched
+            sdf.setTimeZone(timezone);
+        }
+        return sdf;
+    }
+
+
+
+
     private boolean strictParsing = false;
 
     private XHTMLContentHandler handler;
@@ -197,15 +233,39 @@ class MailContentHandler implements ContentHandler {
                 processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
             } else if (fieldname.equalsIgnoreCase("Date")) {
                 DateTimeField dateField = (DateTimeField) parsedField;
-                metadata.set(TikaCoreProperties.CREATED, dateField.getDate());
+                Date date = dateField.getDate();
+                if (date == null) {
+                    date = tryOtherDateFormats(field.getBody());
+                }
+                metadata.set(TikaCoreProperties.CREATED, date);
             }
         } catch (RuntimeException me) {
+            me.printStackTrace();
             if (strictParsing) {
                 throw me;
             }
         }
     }
 
+    private static synchronized Date tryOtherDateFormats(String text) { // synchronized; the DateFormat instances in ALTERNATE_DATE_FORMATS are shared statics
+        if (text == null) {
+            return null;
+        }
+        text = text.replaceAll("\\s+", " ").trim(); // collapse each whitespace run to a single space
+        Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
+        if (matcher.find()) {
+            text = matcher.replaceFirst("GMT$1$2:00"); // e.g. a trailing "GMT+1" or "UTC+1" becomes "GMT+1:00"
+        }
+
+        for (DateFormat format : ALTERNATE_DATE_FORMATS) {
+            try {
+                return format.parse(text); // the first format that parses wins
+            } catch (ParseException e) { // deliberately ignored: not this format, try the next one
+            }
+        }
+        return null; // none of the alternate formats could parse the text
+    }
+
     private void processAddressList(ParsedField field, String addressListType,
                                     String metadataField) throws MimeException 
{
         AddressListField toField = (AddressListField) field;

http://git-wip-us.apache.org/repos/asf/tika/blob/09cc658a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 0e8f613..6a69ea5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -32,6 +32,9 @@ import static org.mockito.Mockito.verify;
 
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.TimeZone;
 
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.TikaTest;
@@ -392,4 +395,16 @@ public class RFC822ParserTest extends TikaTest {
         r = getXML("testRFC822_eml");
         assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testDates() throws Exception {
+
+        //tests non-standard dates that mime4j can't parse; the expected value is the UTC timestamp stored under CREATED
+        XMLResult r = getXML("testRFC822_date_utf8");
+        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+        r = getXML("testRFC822_eml"); // NOTE(review): a different fixture is expected to yield the same timestamp - confirm this is intended
+        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
+    }
+
 }

Reply via email to