Repository: tika Updated Branches: refs/heads/2.x 40c5f3bfe -> 45cc8dd9c
TIKA-1976: improve date parsing for rfc822parser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/45cc8dd9 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/45cc8dd9 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/45cc8dd9 Branch: refs/heads/2.x Commit: 45cc8dd9c6f188e3b5650e38ea21d77ff7cfc3f6 Parents: 40c5f3b Author: tballison <[email protected]> Authored: Wed May 18 11:54:28 2016 -0400 Committer: tballison <[email protected]> Committed: Wed May 18 11:54:28 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/mail/MailContentHandler.java | 53 ++++++++++++- .../tika/parser/mail/RFC822ParserTest.java | 83 ++++++++++++++++++-- 2 files changed, 128 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/45cc8dd9/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 7cf8886..9740eff 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -55,6 +55,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; +import static org.apache.tika.utils.DateUtils.MIDDAY; import static org.apache.tika.utils.DateUtils.UTC; /** @@ -68,17 +69,56 @@ class MailContentHandler implements ContentHandler { private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN = Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z"); + //find a time ending in am/pm without a space: 10:30am and + //use this pattern to insert space: 10:30 am + private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b"); + private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] { - //16 May 2016 at 09:30:32 GMT+1 - createDateFormat("dd MMM yyyy 'at' HH:mm:ss z", UTC), // UTC/Zulu + //note that the string is "cleaned" before processing: + //1) condense multiple whitespace to single space + //2) trim() + //3) strip out commas + //4) insert space before am/pm + + //May 16 2016 1:32am + createDateFormat("MMM dd yy hh:mm a", null), + + //this is a standard pattern handled by mime4j; + //but mime4j fails with leading whitespace + createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC), + + createDateFormat("EEE d MMM yy HH:mm:ss z", UTC), + + createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone + + createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM + + //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970) + createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu + + createDateFormat("yy-MM-dd HH:mm:ss", null), + + createDateFormat("MM/dd/yy hh:mm a", null, false), + + //now dates without times + createDateFormat("MMM d yy", MIDDAY, false), + createDateFormat("EEE d MMM yy", MIDDAY, false), + createDateFormat("d MMM yy", MIDDAY, false), + createDateFormat("yy/MM/dd", MIDDAY, false), + createDateFormat("MM/dd/yy", MIDDAY, false) }; private static DateFormat createDateFormat(String format, TimeZone timezone) { + return createDateFormat(format, timezone, true); + } + + private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) { SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); if (timezone != null) { sdf.setTimeZone(timezone); } + sdf.setLenient(isLenient); return sdf; } @@ -244,12 +284,21 @@ class MailContentHandler implements ContentHandler { if (text == null) { return null; } + //strip out additional spaces and trim text = text.replaceAll("\\s+", " ").trim(); + + //strip out commas + text = text.replaceAll(",", ""); Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text); if (matcher.find()) { text = matcher.replaceFirst("GMT$1$2:00"); } + matcher = AM_PM.matcher(text); + if (matcher.find()) { + text = matcher.replaceFirst("$1 $2"); + } + for (DateFormat format : ALTERNATE_DATE_FORMATS) { try { return format.parse(text); http://git-wip-us.apache.org/repos/asf/tika/blob/45cc8dd9/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index b15eac1..a327c49 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -17,11 +17,7 @@ package org.apache.tika.parser.mail; import static java.nio.charset.StandardCharsets.US_ASCII; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; +import static org.junit.Assert.*; import static org.junit.Assume.assumeTrue; import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; @@ -31,10 +27,14 @@ import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.text.DateFormat; +import java.text.DateFormatSymbols; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.Locale; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.tika.TikaTest; @@ -390,13 +390,84 @@ public class RFC822ParserTest extends TikaTest { @Test public void testDates() throws Exception { - //tests non-standard dates that mime4j can't parse XMLResult r = getXML("testRFC822_date_utf8"); assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); r = getXML("testRFC822_eml"); assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED)); + + + String expected = "2016-05-15T01:32:00Z"; + + for (String dateString : new String[]{ + "Sun, 15 May 2016 01:32:00 UTC", //make sure this test basically works + "Sun, 15 May 2016 01:32:00", //no timezone + "Sunday, May 15 2016 1:32 AM", + "May 15 2016 1:32am", + "May 15 2016 1:32 am", + "2016-05-15 01:32:00", + " Sun, 15 May 2016 3:32:00 +0200",//format correctly handled by mime4j if no leading whitespace + " Sun, 14 May 2016 20:32:00 EST", + }) { + testDate(dateString, expected); + } + + //now try days without times + expected = "2016-05-15T12:00:00Z"; + for (String dateString : new String[]{ + "May 15, 2016", + "Sun, 15 May 2016", + "15 May 2016", + }) { + testDate(dateString, expected); + } + } + + @Test + public void testTrickyDates() throws Exception { + DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US)); + //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990 + Date date1980 = df.parse("1980-01-01"); + for (String dateString : new String[] { + "Mon, 29 Jan 96 14:02 GMT", + "7/20/95 1:12pm", + "08/14/2000 12:48 AM", + "06/24/2008, Tuesday, 11 AM", + "11/14/08", + "12/02/1996", + "96/12/02", + }) { + Date parsedDate = getDate(dateString); + if (parsedDate != null) { + assertTrue("date must be after 1980:"+dateString, parsedDate.getTime() > date1980.getTime()); + } + } + //TODO: mime4j misparses these to pre 1980 dates + //"Wed, 27 Dec 95 11:20:40 EST", + //"26 Aug 00 11:14:52 EDT" + // + //We are still misparsing: 8/1/03 to a pre 1980 date + + } + + private void testDate(String dateString, String expected) throws Exception { + Date parsedDate = getDate(dateString); + assertNotNull("couldn't parse " + dateString, parsedDate); + DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + String parsedDateString = df.format(parsedDate); + assertEquals("failed to match: "+dateString, expected, parsedDateString); + } + + private Date getDate(String dateString) throws Exception { + String mail = "From: [email protected]\n"+ + "Date: "+dateString+"\n"; + Parser p = new RFC822Parser(); + Metadata m = new Metadata(); + try (InputStream is = TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) { + p.parse(is, new DefaultHandler(), m, new ParseContext()); + } + return m.getDate(TikaCoreProperties.CREATED); } @Test
