Repository: tika
Updated Branches:
  refs/heads/2.x 40c5f3bfe -> 45cc8dd9c


TIKA-1976: improve date parsing for rfc822parser


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/45cc8dd9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/45cc8dd9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/45cc8dd9

Branch: refs/heads/2.x
Commit: 45cc8dd9c6f188e3b5650e38ea21d77ff7cfc3f6
Parents: 40c5f3b
Author: tballison <[email protected]>
Authored: Wed May 18 11:54:28 2016 -0400
Committer: tballison <[email protected]>
Committed: Wed May 18 11:54:28 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/mail/MailContentHandler.java    | 53 ++++++++++++-
 .../tika/parser/mail/RFC822ParserTest.java      | 83 ++++++++++++++++++--
 2 files changed, 128 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/45cc8dd9/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 7cf8886..9740eff 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -55,6 +55,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
+import static org.apache.tika.utils.DateUtils.MIDDAY;
 import static org.apache.tika.utils.DateUtils.UTC;
 
 /**
@@ -68,17 +69,56 @@ class MailContentHandler implements ContentHandler {
     private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
             Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
 
+    //find a time ending in am/pm without a space: 10:30am and
+    //use this pattern to insert space: 10:30 am
+    private static final Pattern AM_PM = 
Pattern.compile("(?i)(\\d)([ap]m)\\b");
+
     private static final DateFormat[] ALTERNATE_DATE_FORMATS = new 
DateFormat[] {
-            //16 May 2016 at 09:30:32  GMT+1
-            createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+            //note that the string is "cleaned" before processing:
+            //1) condense multiple whitespace to single space
+            //2) trim()
+            //3) strip out commas
+            //4) insert space before am/pm
+
+            //May 16 2016 1:32am
+            createDateFormat("MMM dd yy hh:mm a", null),
+
+            //this is a standard pattern handled by mime4j;
+            //but mime4j fails with leading whitespace
+            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+
+            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+
+            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+
+            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
+
+            //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
+            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+
+            createDateFormat("yy-MM-dd HH:mm:ss", null),
+
+            createDateFormat("MM/dd/yy hh:mm a", null, false),
+
+            //now dates without times
+            createDateFormat("MMM d yy", MIDDAY, false),
+            createDateFormat("EEE d MMM yy", MIDDAY, false),
+            createDateFormat("d MMM yy", MIDDAY, false),
+            createDateFormat("yy/MM/dd", MIDDAY, false),
+            createDateFormat("MM/dd/yy", MIDDAY, false)
     };
 
     private static DateFormat createDateFormat(String format, TimeZone 
timezone) {
+        return createDateFormat(format, timezone, true);
+    }
+
+    private static DateFormat createDateFormat(String format, TimeZone 
timezone, boolean isLenient) {
         SimpleDateFormat sdf =
                 new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
         if (timezone != null) {
             sdf.setTimeZone(timezone);
         }
+        sdf.setLenient(isLenient);
         return sdf;
     }
 
@@ -244,12 +284,21 @@ class MailContentHandler implements ContentHandler {
         if (text == null) {
             return null;
         }
+        //strip out additional spaces and trim
         text = text.replaceAll("\\s+", " ").trim();
+
+        //strip out commas
+        text = text.replaceAll(",", "");
         Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
         if (matcher.find()) {
             text = matcher.replaceFirst("GMT$1$2:00");
         }
 
+        matcher = AM_PM.matcher(text);
+        if (matcher.find()) {
+            text = matcher.replaceFirst("$1 $2");
+        }
+
         for (DateFormat format : ALTERNATE_DATE_FORMATS) {
             try {
                 return format.parse(text);

http://git-wip-us.apache.org/repos/asf/tika/blob/45cc8dd9/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index b15eac1..a327c49 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -17,11 +17,7 @@
 package org.apache.tika.parser.mail;
 
 import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
+import static org.junit.Assert.*;
 import static org.junit.Assume.assumeTrue;
 import static org.mockito.Matchers.any;
 import static org.mockito.Matchers.eq;
@@ -31,10 +27,14 @@ import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 
 import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
 import java.text.SimpleDateFormat;
 import java.util.Date;
+import java.util.Locale;
 
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.TikaTest;
@@ -390,13 +390,84 @@ public class RFC822ParserTest extends TikaTest {
 
     @Test
     public void testDates() throws Exception {
-
         //tests non-standard dates that mime4j can't parse
         XMLResult r = getXML("testRFC822_date_utf8");
         assertEquals("2016-05-16T08:30:32Z", 
r.metadata.get(TikaCoreProperties.CREATED));
 
         r = getXML("testRFC822_eml");
         assertEquals("2016-05-16T08:30:32Z", 
r.metadata.get(TikaCoreProperties.CREATED));
+
+
+        String expected = "2016-05-15T01:32:00Z";
+
+        for (String dateString : new String[]{
+                "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works
+                "Sun, 15 May 2016 01:32:00", //no timezone
+                "Sunday, May 15 2016 1:32 AM",
+                "May 15 2016 1:32am",
+                "May 15 2016 1:32 am",
+                "2016-05-15 01:32:00",
+                "      Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace
+                "      Sun, 14 May 2016 20:32:00 EST",
+        }) {
+            testDate(dateString, expected);
+        }
+
+        //now try days without times
+        expected = "2016-05-15T12:00:00Z";
+        for (String dateString : new String[]{
+                "May 15, 2016",
+                "Sun, 15 May 2016",
+                "15 May 2016",
+        }) {
+            testDate(dateString, expected);
+        }
+    }
+
+    @Test
+    public void testTrickyDates() throws Exception {
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new 
DateFormatSymbols(Locale.US));
+        //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
+        Date date1980 = df.parse("1980-01-01");
+        for (String dateString : new String[] {
+                "Mon, 29 Jan 96 14:02 GMT",
+                "7/20/95 1:12pm",
+                "08/14/2000  12:48 AM",
+                "06/24/2008, Tuesday, 11 AM",
+                "11/14/08",
+                "12/02/1996",
+                "96/12/02",
+        }) {
+            Date parsedDate = getDate(dateString);
+            if (parsedDate != null) {
+                assertTrue("date must be after 1980:"+dateString, 
parsedDate.getTime() > date1980.getTime());
+            }
+        }
+        //TODO: mime4j misparses these to pre 1980 dates
+        //"Wed, 27 Dec 95 11:20:40 EST",
+        //"26 Aug 00 11:14:52 EDT"
+        //
+        //We are still misparsing: 8/1/03 to a pre 1980 date
+
+    }
+
+    private void testDate(String dateString, String expected) throws Exception 
{
+        Date parsedDate = getDate(dateString);
+        assertNotNull("couldn't parse " + dateString, parsedDate);
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+        String parsedDateString = df.format(parsedDate);
+        assertEquals("failed to match: "+dateString, expected, 
parsedDateString);
+    }
+
+    private Date getDate(String dateString) throws Exception {
+        String mail = "From: [email protected]\n"+
+                "Date: "+dateString+"\n";
+        Parser p = new RFC822Parser();
+        Metadata m = new Metadata();
+        try (InputStream is = 
TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) {
+            p.parse(is, new DefaultHandler(), m, new ParseContext());
+        }
+        return m.getDate(TikaCoreProperties.CREATED);
     }
 
     @Test

Reply via email to