Repository: james-project Updated Branches: refs/heads/master 30d6d3c30 -> 77ee834cb
JAMES-2005 Respect the body charset when extracting its content Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/6fca7292 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/6fca7292 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/6fca7292 Branch: refs/heads/master Commit: 6fca72923607d331f753e242334ce2496ab09b4e Parents: 30d6d3c Author: Antoine Duprat <adup...@linagora.com> Authored: Tue Apr 18 11:54:44 2017 +0200 Committer: Antoine Duprat <adup...@linagora.com> Committed: Tue Apr 18 15:49:42 2017 +0200 ---------------------------------------------------------------------- .../util/mime/MessageContentExtractor.java | 10 ++++-- .../util/mime/MessageContentExtractorTest.java | 32 +++++++++++++++++++- .../test/resources/eml/windows1252charset.eml | 12 ++++---- 3 files changed, 45 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/6fca7292/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java ---------------------------------------------------------------------- diff --git a/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java b/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java index 7f819aa..99c320e 100644 --- a/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java +++ b/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java @@ -20,6 +20,7 @@ package org.apache.james.util.mime; import java.io.IOException; +import java.nio.charset.Charset; import java.util.Objects; import java.util.Optional; import java.util.function.Function; @@ -36,7 +37,6 @@ import org.apache.james.mime4j.dom.TextBody; import com.github.fge.lambdas.Throwing; 
import com.github.fge.lambdas.functions.ThrowingFunction; -import com.google.common.base.Charsets; public class MessageContentExtractor { @@ -92,7 +92,13 @@ public class MessageContentExtractor { } private Optional<String> asString(TextBody textBody) throws IOException { - return Optional.ofNullable(IOUtils.toString(textBody.getInputStream(), Charsets.UTF_8)); + return Optional.ofNullable(IOUtils.toString(textBody.getInputStream(), charset(Optional.ofNullable(textBody.getMimeCharset())))); + } + + private Charset charset(Optional<String> charset) { + return charset + .map(Charset::forName) + .orElse(org.apache.james.mime4j.Charsets.DEFAULT_CHARSET); } private MessageContent retrieveHtmlAndPlainTextContent(Multipart multipart) throws IOException { http://git-wip-us.apache.org/repos/asf/james-project/blob/6fca7292/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java ---------------------------------------------------------------------- diff --git a/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java b/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java index 9607564..0a4750f 100644 --- a/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java +++ b/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java @@ -21,6 +21,7 @@ package org.apache.james.util.mime; import static org.assertj.core.api.Assertions.assertThat; import java.io.IOException; +import java.nio.charset.Charset; import java.util.Optional; import javax.mail.internet.MimeMessage; @@ -36,7 +37,6 @@ import org.apache.james.mime4j.message.MessageBuilder; import org.apache.james.mime4j.message.MultipartBuilder; import org.apache.james.mime4j.stream.Field; import org.apache.james.mime4j.util.ByteSequence; -import org.apache.james.util.mime.MessageContentExtractor; import 
org.apache.james.util.mime.MessageContentExtractor.MessageContent; import org.junit.Before; import org.junit.Test; @@ -488,4 +488,34 @@ public class MessageContentExtractorTest { assertThat(actual).isEqualTo(expected); } + + @Test + public void extractShouldRespectCharsetWhenOtherThanUTF8() throws IOException { + String text = "éééé\r\nèèèè\r\nàààà"; + Message message = MessageBuilder.create() + .setBody(text, Charset.forName("windows-1252")) + .build(); + MessageContent actual = testee.extract(message); + assertThat(actual.getTextBody()).contains(text); + } + + @Test + public void extractShouldRespectCharsetWhenUTF8() throws IOException { + String text = "éééé\r\nèèèè\r\nàààà"; + Message message = MessageBuilder.create() + .setBody(text, Charsets.UTF_8) + .build(); + MessageContent actual = testee.extract(message); + assertThat(actual.getTextBody()).contains(text); + } + + @Test + public void extractShouldUseUSASCIIWhenNoCharset() throws IOException { + String text = "éééé\r\nèèèè\r\nàààà"; + Message message = MessageBuilder.create() + .setBody(text, null) + .build(); + MessageContent actual = testee.extract(message); + assertThat(actual.getTextBody()).contains("????\r\n????\r\n????"); + } } http://git-wip-us.apache.org/repos/asf/james-project/blob/6fca7292/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml ---------------------------------------------------------------------- diff --git a/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml b/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml index 6d37efd..a5eaedf 100644 --- a/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml +++ 
b/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml @@ -24,20 +24,20 @@ This is a multi-part message in MIME format. Content-Type: text/plain; charset=windows-1252; format=flowed Content-Transfer-Encoding: 8bit -à à à à +àààà -éééé +éééé -èèèè +èèèè --------------4609A4E8FA0583CE89472864 Content-Type: text/html; charset=windows-1252 Content-Transfer-Encoding: 8bit <html> - <p>à à à à </p> - <p>éééé</p> - <p>èèèè</p> + <p>àààà</p> + <p>éééé</p> + <p>èèèè</p> </html> --------------4609A4E8FA0583CE89472864-- --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org