Repository: tika
Updated Branches:
  refs/heads/master 4a324ff94 -> 534347d0f


TIKA-1976 improve date parsing in rfc822 parser


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/534347d0
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/534347d0
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/534347d0

Branch: refs/heads/master
Commit: 534347d0f095bd89409140db00927a532fb53213
Parents: 4a324ff
Author: tballison <[email protected]>
Authored: Wed May 18 11:58:40 2016 -0400
Committer: tballison <[email protected]>
Committed: Wed May 18 11:58:40 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/mail/MailContentHandler.java    | 52 ++++++++++++-
 .../tika/parser/mail/RFC822ParserTest.java      | 77 +++++++++++++++++++-
 2 files changed, 126 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/534347d0/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index acdd28c..8d16961 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -58,6 +58,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
+import static org.apache.tika.utils.DateUtils.MIDDAY;
 import static org.apache.tika.utils.DateUtils.UTC;
 
 /**
@@ -71,17 +72,56 @@ class MailContentHandler implements ContentHandler {
     private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
             Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
 
+    //find a time ending in am/pm without a space: 10:30am and
+    //use this pattern to insert space: 10:30 am
+    private static final Pattern AM_PM = 
Pattern.compile("(?i)(\\d)([ap]m)\\b");
+
     private static final DateFormat[] ALTERNATE_DATE_FORMATS = new 
DateFormat[] {
-            //16 May 2016 at 09:30:32  GMT+1
-            createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+            //note that the string is "cleaned" before processing:
+            //1) condense multiple whitespace to single space
+            //2) trim()
+            //3) strip out commas
+            //4) insert space before am/pm
+
+            //May 16 2016 1:32am
+            createDateFormat("MMM dd yy hh:mm a", null),
+
+            //this is a standard pattern handled by mime4j;
+            //but mime4j fails with leading whitespace
+            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+
+            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+
+            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+
+            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 
2016 1:32 PM
+
+            //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
+            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+
+            createDateFormat("yy-MM-dd HH:mm:ss", null),
+
+            createDateFormat("MM/dd/yy hh:mm a", null, false),
+
+            //now dates without times
+            createDateFormat("MMM d yy", MIDDAY, false),
+            createDateFormat("EEE d MMM yy", MIDDAY, false),
+            createDateFormat("d MMM yy", MIDDAY, false),
+            createDateFormat("yy/MM/dd", MIDDAY, false),
+            createDateFormat("MM/dd/yy", MIDDAY, false)
     };
 
     private static DateFormat createDateFormat(String format, TimeZone 
timezone) {
+        return createDateFormat(format, timezone, true);
+    }
+
+    private static DateFormat createDateFormat(String format, TimeZone 
timezone, boolean isLenient) {
         SimpleDateFormat sdf =
                 new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
         if (timezone != null) {
             sdf.setTimeZone(timezone);
         }
+        sdf.setLenient(isLenient);
         return sdf;
     }
 
@@ -248,11 +288,19 @@ class MailContentHandler implements ContentHandler {
             return null;
         }
         text = text.replaceAll("\\s+", " ").trim();
+        //strip out commas
+        text = text.replaceAll(",", "");
+
         Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
         if (matcher.find()) {
             text = matcher.replaceFirst("GMT$1$2:00");
         }
 
+        matcher = AM_PM.matcher(text);
+        if (matcher.find()) {
+            text = matcher.replaceFirst("$1 $2");
+        }
+
         for (DateFormat format : ALTERNATE_DATE_FORMATS) {
             try {
                 return format.parse(text);

http://git-wip-us.apache.org/repos/asf/tika/blob/534347d0/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 2e177ee..c3dcbb5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -33,8 +33,11 @@ import static org.mockito.Mockito.verify;
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
 import java.text.SimpleDateFormat;
 import java.util.Date;
+import java.util.Locale;
 import java.util.TimeZone;
 
 import org.apache.james.mime4j.stream.MimeConfig;
@@ -399,16 +402,88 @@ public class RFC822ParserTest extends TikaTest {
 
     @Test
     public void testDates() throws Exception {
-
         //tests non-standard dates that mime4j can't parse
         XMLResult r = getXML("testRFC822_date_utf8");
         assertEquals("2016-05-16T08:30:32Z", 
r.metadata.get(TikaCoreProperties.CREATED));
 
         r = getXML("testRFC822_eml");
         assertEquals("2016-05-16T08:30:32Z", 
r.metadata.get(TikaCoreProperties.CREATED));
+
+
+        String expected = "2016-05-15T01:32:00Z";
+
+        for (String dateString : new String[]{
+                "Sun, 15 May 2016 01:32:00 UTC", //make sure this test 
basically works
+                "Sun, 15 May 2016 01:32:00", //no timezone
+                "Sunday, May 15 2016 1:32 AM",
+                "May 15 2016 1:32am",
+                "May 15 2016 1:32 am",
+                "2016-05-15 01:32:00",
+                "      Sun, 15 May 2016 3:32:00 +0200",//format correctly 
handled by mime4j if no leading whitespace
+                "      Sun, 14 May 2016 20:32:00 EST",
+        }) {
+            testDate(dateString, expected);
+        }
+
+        //now try days without times
+        expected = "2016-05-15T12:00:00Z";
+        for (String dateString : new String[]{
+                "May 15, 2016",
+                "Sun, 15 May 2016",
+                "15 May 2016",
+        }) {
+            testDate(dateString, expected);
+        }
     }
 
     @Test
+    public void testTrickyDates() throws Exception {
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new 
DateFormatSymbols(Locale.US));
+        //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
+        Date date1980 = df.parse("1980-01-01");
+        for (String dateString : new String[] {
+                "Mon, 29 Jan 96 14:02 GMT",
+                "7/20/95 1:12pm",
+                "08/14/2000  12:48 AM",
+                "06/24/2008, Tuesday, 11 AM",
+                "11/14/08",
+                "12/02/1996",
+                "96/12/02",
+        }) {
+            Date parsedDate = getDate(dateString);
+            if (parsedDate != null) {
+                assertTrue("date must be after 1980:"+dateString, 
parsedDate.getTime() > date1980.getTime());
+            }
+        }
+        //TODO: mime4j misparses these to pre 1980 dates
+        //"Wed, 27 Dec 95 11:20:40 EST",
+        //"26 Aug 00 11:14:52 EDT"
+        //
+        //We are still misparsing: 8/1/03 to a pre 1980 date
+
+    }
+
+    private void testDate(String dateString, String expected) throws Exception 
{
+        Date parsedDate = getDate(dateString);
+        assertNotNull("couldn't parse " + dateString, parsedDate);
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+        String parsedDateString = df.format(parsedDate);
+        assertEquals("failed to match: "+dateString, expected, 
parsedDateString);
+    }
+
+    private Date getDate(String dateString) throws Exception {
+        String mail = "From: [email protected]\n"+
+                "Date: "+dateString+"\n";
+        Parser p = new RFC822Parser();
+        Metadata m = new Metadata();
+        try (InputStream is = 
TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) {
+            p.parse(is, new DefaultHandler(), m, new ParseContext());
+        }
+        return m.getDate(TikaCoreProperties.CREATED);
+    }
+
+
+    @Test
     public void testMultipleSubjects() throws Exception {
         //adapted from govdocs1 303710.txt
         String s = "From: Shawn Jones [[email protected]]\n" +

Reply via email to