This is an automated email from the ASF dual-hosted git repository. rcordier pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/james-project.git
commit ef87f69290a724e9f6065173f91ad051e6268d02 Author: Benoit Tellier <[email protected]> AuthorDate: Tue Feb 4 16:27:35 2020 +0700 MAILBOX-395 ElasticSearch indexing should not fail upon invalid charset --- .../mailbox/elasticsearch/json/MimePartParser.java | 17 +++++++--- .../json/MessageToElasticSearchJsonTest.java | 22 ++++++++++++ .../src/test/resources/eml/invalidCharset.eml | 10 ++++++ .../src/test/resources/eml/invalidCharset.json | 39 ++++++++++++++++++++++ 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java index 7cd6e3a..a87ae44 100644 --- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java +++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java @@ -33,10 +33,13 @@ import org.apache.james.mime4j.message.MaximalBodyDescriptor; import org.apache.james.mime4j.stream.EntityState; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.james.mime4j.stream.MimeTokenStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; public class MimePartParser { + private static final Logger LOGGER = LoggerFactory.getLogger(MimePartParser.class); private final Message message; private final TextExtractor textExtractor; @@ -120,10 +123,16 @@ public class MimePartParser { .addSubType(descriptor.getSubType()) .addContentDisposition(descriptor.getContentDispositionType()) .addFileName(descriptor.getContentDispositionFilename()); - - Optional.ofNullable(descriptor.getCharset()) - .map(Charset::forName) - .ifPresent(currentlyBuildMimePart::charset); + extractCharset(descriptor); } + private void extractCharset(MaximalBodyDescriptor descriptor) { + try { + Optional.ofNullable(descriptor.getCharset()) + .map(Charset::forName) + .ifPresent(currentlyBuildMimePart::charset); + } catch (Exception e) { + LOGGER.info("Failed parsing charset", e); + } + } } diff --git a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java index 8586c1a..7c7f5ba 100644 --- a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java +++ b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java @@ -110,6 +110,28 @@ class MessageToElasticSearchJsonTest { } @Test + void invalidCharsetShouldBeWellConvertedToJson() throws IOException { + MessageToElasticSearchJson messageToElasticSearchJson = new MessageToElasticSearchJson( + new DefaultTextExtractor(), + ZoneId.of("Europe/Paris"), IndexAttachments.YES); + MailboxMessage spamMail = new SimpleMailboxMessage(MESSAGE_ID, + date, + SIZE, + BODY_START_OCTET, + ClassLoaderUtils.getSystemResourceAsSharedStream("eml/invalidCharset.eml"), + new Flags(), + propertyBuilder, + MAILBOX_ID); + spamMail.setUid(UID); + spamMail.setModSeq(MOD_SEQ); + + String actual = messageToElasticSearchJson.convertToJson(spamMail, ImmutableList.of(USERNAME)); + assertThatJson(actual) + .when(IGNORING_ARRAY_ORDER) + .isEqualTo(ClassLoaderUtils.getSystemResourceAsString("eml/invalidCharset.json")); + } + + @Test void htmlEmailShouldBeWellConvertedToJson() throws IOException { MessageToElasticSearchJson messageToElasticSearchJson = new MessageToElasticSearchJson( new DefaultTextExtractor(), diff --git a/mailbox/store/src/test/resources/eml/invalidCharset.eml b/mailbox/store/src/test/resources/eml/invalidCharset.eml new file mode 100644 index 0000000..62bc3fb --- /dev/null +++ b/mailbox/store/src/test/resources/eml/invalidCharset.eml @@ -0,0 +1,10 @@ +To: Antoine DUPRAT <[email protected]> +From: Antoine DUPRAT <[email protected]> +Subject: Inline attachment +Message-ID: <[email protected]> +Date: Tue, 5 Jul 2016 11:47:46 +0200 +MIME-Version: 1.0 +Content-Type: text/plain; charset=%invalid; format=flowed +Content-Transfer-Encoding: 7bit + +This is an inline attachment: Cheers! \ No newline at end of file diff --git a/mailbox/store/src/test/resources/eml/invalidCharset.json b/mailbox/store/src/test/resources/eml/invalidCharset.json new file mode 100644 index 0000000..eed4184 --- /dev/null +++ b/mailbox/store/src/test/resources/eml/invalidCharset.json @@ -0,0 +1,39 @@ +{ + "attachments":[], + "bcc":[], + "htmlBody":null, + "textBody":"This is an inline attachment: Cheers!", + "cc":[], + "date":"2015-06-07T00:00:00+0200", + "from":[{"name":"Antoine DUPRAT","address":"[email protected]"}], + "hasAttachment":false, + "headers":[ + {"name":"to","value":"Antoine DUPRAT <[email protected]>"}, + {"name":"from","value":"Antoine DUPRAT <[email protected]>"}, + {"name":"subject","value":"Inline attachment"}, + {"name":"message-id","value":"<[email protected]>"}, + {"name":"date","value":"Tue, 5 Jul 2016 11:47:46 +0200"}, + {"name":"mime-version","value":"1.0"}, + {"name":"content-type","value":"text/plain; charset=%invalid; format=flowed"}, + {"name":"content-transfer-encoding","value":"7bit"} + ], + "mailboxId":"18", + "mediaType":"plain", + "messageId":"184", + "modSeq":42, + "sentDate":"2016-07-05T11:47:46+0200", + "size":25, + "subject":["Inline attachment"], + "subtype":"text", + "text":"Antoine DUPRAT [email protected] Antoine DUPRAT [email protected] Inline attachment This is an inline attachment: Cheers!", + "to":[{"name":"Antoine DUPRAT","address":"[email protected]"}], + "uid":25, + "userFlags":[], + "mimeMessageID":"<[email protected]>", + "isAnswered":false, + "isDeleted":false, + "isDraft":false, + "isFlagged":false, + "isRecent":false, + "isUnread":true +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
