Repository: tika Updated Branches: refs/heads/master 4a324ff94 -> 534347d0f
TIKA-1976 improve date parsing in rfc822 parser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/534347d0 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/534347d0 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/534347d0 Branch: refs/heads/master Commit: 534347d0f095bd89409140db00927a532fb53213 Parents: 4a324ff Author: tballison <[email protected]> Authored: Wed May 18 11:58:40 2016 -0400 Committer: tballison <[email protected]> Committed: Wed May 18 11:58:40 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/mail/MailContentHandler.java | 52 ++++++++++++- .../tika/parser/mail/RFC822ParserTest.java | 77 +++++++++++++++++++- 2 files changed, 126 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/534347d0/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index acdd28c..8d16961 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -58,6 +58,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; +import static org.apache.tika.utils.DateUtils.MIDDAY; import static org.apache.tika.utils.DateUtils.UTC; /** @@ -71,17 +72,56 @@ class MailContentHandler implements ContentHandler { private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN = Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z"); + //find a time ending in am/pm without a space: 10:30am and + //use this pattern to insert space: 
10:30 am + private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b"); + private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] { - //16 May 2016 at 09:30:32 GMT+1 - createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC), // UTC/Zulu + //note that the string is "cleaned" before processing: + //1) condense multiple whitespace to single space + //2) trim() + //3) strip out commas + //4) insert space before am/pm + + //May 16 2016 1:32am + createDateFormat("MMM dd yy hh:mm a", null), + + //this is a standard pattern handled by mime4j; + //but mime4j fails with leading whitespace + createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC), + + createDateFormat("EEE d MMM yy HH:mm:ss z", UTC), + + createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone + + createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM + + //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970) + createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu + + createDateFormat("yy-MM-dd HH:mm:ss", null), + + createDateFormat("MM/dd/yy hh:mm a", null, false), + + //now dates without times + createDateFormat("MMM d yy", MIDDAY, false), + createDateFormat("EEE d MMM yy", MIDDAY, false), + createDateFormat("d MMM yy", MIDDAY, false), + createDateFormat("yy/MM/dd", MIDDAY, false), + createDateFormat("MM/dd/yy", MIDDAY, false) }; private static DateFormat createDateFormat(String format, TimeZone timezone) { + return createDateFormat(format, timezone, true); + } + + private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) { SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); if (timezone != null) { sdf.setTimeZone(timezone); } + sdf.setLenient(isLenient); return sdf; } @@ -248,11 +288,19 @@ class MailContentHandler implements ContentHandler { return null; } text = text.replaceAll("\\s+", " ").trim(); + //strip out commas + text = text.replaceAll(",", ""); + Matcher matcher 
= GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text); if (matcher.find()) { text = matcher.replaceFirst("GMT$1$2:00"); } + matcher = AM_PM.matcher(text); + if (matcher.find()) { + text = matcher.replaceFirst("$1 $2"); + } + for (DateFormat format : ALTERNATE_DATE_FORMATS) { try { return format.parse(text); http://git-wip-us.apache.org/repos/asf/tika/blob/534347d0/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index 2e177ee..c3dcbb5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -33,8 +33,11 @@ import static org.mockito.Mockito.verify; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.text.DateFormat; +import java.text.DateFormatSymbols; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.Locale; import java.util.TimeZone; import org.apache.james.mime4j.stream.MimeConfig; @@ -399,16 +402,88 @@ public class RFC822ParserTest extends TikaTest { @Test public void testDates() throws Exception { - //tests non-standard dates that mime4j can't parse XMLResult r = getXML("testRFC822_date_utf8"); assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); r = getXML("testRFC822_eml"); assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); + + + String expected = "2016-05-15T01:32:00Z"; + + for (String dateString : new String[]{ + "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works + "Sun, 15 May 2016 01:32:00", //no timezone + "Sunday, May 15 2016 1:32 AM", + "May 15 2016 1:32am", + "May 15 2016 1:32 am", + "2016-05-15 
01:32:00", + " Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace + " Sun, 14 May 2016 20:32:00 EST", + }) { + testDate(dateString, expected); + } + + //now try days without times + expected = "2016-05-15T12:00:00Z"; + for (String dateString : new String[]{ + "May 15, 2016", + "Sun, 15 May 2016", + "15 May 2016", + }) { + testDate(dateString, expected); + } } @Test + public void testTrickyDates() throws Exception { + DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US)); + //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990 + Date date1980 = df.parse("1980-01-01"); + for (String dateString : new String[] { + "Mon, 29 Jan 96 14:02 GMT", + "7/20/95 1:12pm", + "08/14/2000 12:48 AM", + "06/24/2008, Tuesday, 11 AM", + "11/14/08", + "12/02/1996", + "96/12/02", + }) { + Date parsedDate = getDate(dateString); + if (parsedDate != null) { + assertTrue("date must be after 1980:"+dateString, parsedDate.getTime() > date1980.getTime()); + } + } + //TODO: mime4j misparses these to pre 1980 dates + //"Wed, 27 Dec 95 11:20:40 EST", + //"26 Aug 00 11:14:52 EDT" + // + //We are still misparsing: 8/1/03 to a pre 1980 date + + } + + private void testDate(String dateString, String expected) throws Exception { + Date parsedDate = getDate(dateString); + assertNotNull("couldn't parse " + dateString, parsedDate); + DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + String parsedDateString = df.format(parsedDate); + assertEquals("failed to match: "+dateString, expected, parsedDateString); + } + + private Date getDate(String dateString) throws Exception { + String mail = "From: [email protected]\n"+ + "Date: "+dateString+"\n"; + Parser p = new RFC822Parser(); + Metadata m = new Metadata(); + try (InputStream is = TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) { + p.parse(is, new DefaultHandler(), m, new ParseContext()); + } + return m.getDate(TikaCoreProperties.CREATED); 
+ } + + + @Test public void testMultipleSubjects() throws Exception { //adapted from govdocs1 303710.txt String s = "From: Shawn Jones [[email protected]]\n" +
