Author: btellier Date: Mon Jun 29 08:39:30 2015 New Revision: 1688139 URL: http://svn.apache.org/r1688139 Log: MAILBOX-234 Dates extraction from headers
Modified: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json Modified: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java (original) +++ james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java Mon Jun 29 08:39:30 2015 @@ -19,6 +19,7 @@ package org.apache.james.mailbox.elasticsearch.json; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableMultimap; @@ -39,6 +40,8 @@ import java.time.format.DateTimeFormatte import java.util.HashSet; import java.util.Optional; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -46,6 +49,14 @@ public class HeaderCollection { public static class Builder { + // Some sent e-mail have this form : Wed, 3 Jun 2015 09:05:46 +0000 (UTC) + // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed, 3 Jun 2015 09:05:46 +0000 only + // This REGEXP is here to match ( in order to remove ) the possible invalid end of a header date + // Example of matching patterns : + // (UTC) + // (CEST) + private static final Pattern DATE_SANITIZING_PATTERN = Pattern.compile(" *\\(.*\\) *"); + private final Set<EMailer> toAddressSet; private final Set<EMailer> fromAddressSet; private final Set<EMailer> ccAddressSet; @@ -135,13 +146,26 @@ public class HeaderCollection { private Optional<ZonedDateTime> toISODate(String value) { try { - return Optional.of(ZonedDateTime.parse(value, DateTimeFormatter.RFC_1123_DATE_TIME)); + return Optional.of(ZonedDateTime.parse( + sanitizeDateStringHeaderValue(value), + DateTimeFormatter.RFC_1123_DATE_TIME)); } catch (Exception e) { LOGGER.info("Can not parse receive date " + value); return Optional.empty(); } } + @VisibleForTesting String sanitizeDateStringHeaderValue(String value) { + // Some sent e-mail have this form : Wed, 3 Jun 2015 09:05:46 +0000 (UTC) + // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed, 3 Jun 2015 09:05:46 +0000 only + // This method is here to convert the first date into something parsable by RFC_1123_DATE_TIME DateTimeFormatter + Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value); + if (sanitizerMatcher.find()) { + return value.substring(0 , sanitizerMatcher.start()); + } + return value; + } + } public static final String TO = "to"; Modified: james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java (original) +++ james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java Mon Jun 29 08:39:30 2015 @@ -130,6 +130,13 @@ public class HeaderCollectionTest { } @Test + public void nonStandardDatesShouldBeRetreived() { + HeaderCollection headerCollection = HeaderCollection.builder().add(new FieldImpl("Date", "Thu, 4 Jun 2015 06:08:41 +0200 (UTC)")).build(); + assertThat(DATE_TIME_FORMATTER.format(headerCollection.getSentDate().get())) + .isEqualTo("2015/06/04 06:08:41"); + } + + @Test public void dateShouldBeAbsentOnInvalidHeader() { HeaderCollection headerCollection = HeaderCollection.builder().add(new FieldImpl("Date", "Not a date")).build(); assertThat(headerCollection.getSentDate().isPresent()) @@ -148,4 +155,32 @@ public class HeaderCollectionTest { HeaderCollection.builder().add(null).build(); } + @Test + public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() { + assertThat(HeaderCollection.builder() + .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (CEST)")) + .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() { + assertThat(HeaderCollection.builder() + .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (UTC) ")) + .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() { + assertThat(HeaderCollection.builder() + .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200")) + .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() { + assertThat(HeaderCollection.builder() + .sanitizeDateStringHeaderValue("")) + .isEqualTo(""); + } + } Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml (original) +++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml Mon Jun 29 08:39:30 2015 @@ -32,7 +32,7 @@ Delivered-To: mailing list server-dev@ja Received: (qmail 37236 invoked by uid 99); 4 Jun 2015 09:23:38 -0000 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 Jun 2015 09:23:38 +0000 -Date: Thu, 4 Jun 2015 09:23:37 +0000 +Date: Thu, 4 Jun 2015 09:23:37 +0000 (UTC) From: "Tellier Benoit (JIRA)" <j...@apache.org> To: server-dev@james.apache.org Message-ID: <jira.12835341.1433409792000.9340.1433409817...@atlassian.jira> Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml (original) +++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml Mon Jun 29 08:39:30 2015 @@ -32,7 +32,7 @@ Delivered-To: mailing list server-dev@ja Received: (qmail 43130 invoked by uid 99); 4 Jun 2015 09:27:38 -0000 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 Jun 2015 09:27:38 +0000 -Date: Thu, 4 Jun 2015 09:27:37 +0000 +Date: Thu, 4 Jun 2015 09:27:37 +0000 (UTC) From: "Tellier Benoit (JIRA)" <j...@apache.org> To: server-dev@james.apache.org Message-ID: <jira.12781874.1426269127000.9353.1433410057...@atlassian.jira> Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml (original) +++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml Mon Jun 29 08:39:30 2015 @@ -34,7 +34,7 @@ Delivered-To: mailing list server-dev@ja Received: (qmail 1132 invoked by uid 99); 2 Jun 2015 08:16:20 -0000 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 02 Jun 2015 08:16:20 +0000 -Date: Tue, 2 Jun 2015 08:16:19 +0000 +Date: Tue, 2 Jun 2015 08:16:19 +0000 (UTC) From: "Eric Charles (JIRA)" <j...@apache.org> To: server-dev@james.apache.org Message-ID: <jira.12473940.1284322083000.91735.1433232979...@atlassian.jira> Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml (original) +++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml Mon Jun 29 08:39:30 2015 @@ -34,7 +34,7 @@ Delivered-To: mailing list mailet-api@ja Received: (qmail 81730 invoked by uid 99); 15 May 2015 06:36:00 -0000 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 15 May 2015 06:36:00 +0000 -Date: Fri, 15 May 2015 06:35:59 +0000 +Date: Fri, 15 May 2015 06:35:59 +0000 (UTC) From: "Eric Charles (JIRA)" <mailet-...@james.apache.org> To: mailet-...@james.apache.org Message-ID: <jira.12825882.1430301328000.124152.1431671759...@atlassian.jira> Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml (original) +++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml Mon Jun 29 08:39:30 2015 @@ -31,7 +31,7 @@ Content-Type: multipart/mixed; boundary= Content-Transfer-Encoding: 7bit MIME-Version: 1.0 From: "Content-filter at spam.minet.net" <postmas...@minet.net> -Date: Wed, 3 Jun 2015 09:05:46 +0000 +Date: Wed, 3 Jun 2015 09:05:46 +0000 (UTC) To: <r...@listes.minet.net> Message-ID: <vass-izaxqm...@spam.minet.net> Subject: [root] UNCHECKED contents in mail FROM <quenti...@riseup.net> Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json?rev=1688139&r1=1688138&r2=1688139&view=diff ============================================================================== --- james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json (original) +++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json Mon Jun 29 08:39:30 2015 @@ -12,7 +12,7 @@ "1.0" ], "date": [ - "Wed, 3 Jun 2015 09:05:46 +0000" + "Wed, 3 Jun 2015 09:05:46 +0000 (UTC)" ], "x-beenthere": [ "r...@listes.minet.net" @@ -103,7 +103,7 @@ "subject": [ "[root] UNCHECKED contents in mail FROM <quenti...@riseup.net>" ], - "sentDate": "2015-06-07T00:00:00+0200", + "sentDate": "2015-06-03T09:05:46+0000", "properties": [ { "namespace": "http://james.apache.org/rfc2045/Content-Type", --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org