Repository: james-project
Updated Branches:
  refs/heads/master 30d6d3c30 -> 77ee834cb


JAMES-2005 Respect the body charset when extracting its content


Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/6fca7292
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/6fca7292
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/6fca7292

Branch: refs/heads/master
Commit: 6fca72923607d331f753e242334ce2496ab09b4e
Parents: 30d6d3c
Author: Antoine Duprat <adup...@linagora.com>
Authored: Tue Apr 18 11:54:44 2017 +0200
Committer: Antoine Duprat <adup...@linagora.com>
Committed: Tue Apr 18 15:49:42 2017 +0200

----------------------------------------------------------------------
 .../util/mime/MessageContentExtractor.java      | 10 ++++--
 .../util/mime/MessageContentExtractorTest.java  | 32 +++++++++++++++++++-
 .../test/resources/eml/windows1252charset.eml   | 12 ++++----
 3 files changed, 45 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/james-project/blob/6fca7292/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java
----------------------------------------------------------------------
diff --git a/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java b/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java
index 7f819aa..99c320e 100644
--- a/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java
+++ b/server/container/util-java8/src/main/java/org/apache/james/util/mime/MessageContentExtractor.java
@@ -20,6 +20,7 @@
 package org.apache.james.util.mime;
 
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.Function;
@@ -36,7 +37,6 @@ import org.apache.james.mime4j.dom.TextBody;
 
 import com.github.fge.lambdas.Throwing;
 import com.github.fge.lambdas.functions.ThrowingFunction;
-import com.google.common.base.Charsets;
 
 public class MessageContentExtractor {
 
@@ -92,7 +92,13 @@ public class MessageContentExtractor {
     }
 
     private Optional<String> asString(TextBody textBody) throws IOException {
-        return Optional.ofNullable(IOUtils.toString(textBody.getInputStream(), Charsets.UTF_8));
+        return Optional.ofNullable(IOUtils.toString(textBody.getInputStream(), charset(Optional.ofNullable(textBody.getMimeCharset()))));
+    }
+
+    private Charset charset(Optional<String> charset) {
+        return charset
+                .map(Charset::forName)
+                .orElse(org.apache.james.mime4j.Charsets.DEFAULT_CHARSET);
     }
 
     private MessageContent retrieveHtmlAndPlainTextContent(Multipart multipart) throws IOException {

http://git-wip-us.apache.org/repos/asf/james-project/blob/6fca7292/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java b/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java
index 9607564..0a4750f 100644
--- a/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java
+++ b/server/container/util-java8/src/test/java/org/apache/james/util/mime/MessageContentExtractorTest.java
@@ -21,6 +21,7 @@ package org.apache.james.util.mime;
 import static org.assertj.core.api.Assertions.assertThat;
 
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.Optional;
 
 import javax.mail.internet.MimeMessage;
@@ -36,7 +37,6 @@ import org.apache.james.mime4j.message.MessageBuilder;
 import org.apache.james.mime4j.message.MultipartBuilder;
 import org.apache.james.mime4j.stream.Field;
 import org.apache.james.mime4j.util.ByteSequence;
-import org.apache.james.util.mime.MessageContentExtractor;
 import org.apache.james.util.mime.MessageContentExtractor.MessageContent;
 import org.junit.Before;
 import org.junit.Test;
@@ -488,4 +488,34 @@ public class MessageContentExtractorTest {
 
         assertThat(actual).isEqualTo(expected);
     }
+
+    @Test
+    public void extractShouldRespectCharsetWhenOtherThanUTF8() throws IOException {
+        String text = "éééé\r\nèèèè\r\nàààà";
+        Message message = MessageBuilder.create()
+                .setBody(text, Charset.forName("windows-1252"))
+                .build();
+        MessageContent actual = testee.extract(message);
+        assertThat(actual.getTextBody()).contains(text);
+    }
+
+    @Test
+    public void extractShouldRespectCharsetWhenUTF8() throws IOException {
+        String text = "éééé\r\nèèèè\r\nàààà";
+        Message message = MessageBuilder.create()
+                .setBody(text, Charsets.UTF_8)
+                .build();
+        MessageContent actual = testee.extract(message);
+        assertThat(actual.getTextBody()).contains(text);
+    }
+
+    @Test
+    public void extractShouldUseUSASCIIWhenNoCharset() throws IOException {
+        String text = "éééé\r\nèèèè\r\nàààà";
+        Message message = MessageBuilder.create()
+                .setBody(text, null)
+                .build();
+        MessageContent actual = testee.extract(message);
+        assertThat(actual.getTextBody()).contains("????\r\n????\r\n????");
+    }
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/6fca7292/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml
----------------------------------------------------------------------
diff --git a/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml b/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml
index 6d37efd..a5eaedf 100644
--- a/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml
+++ b/server/protocols/jmap-integration-testing/jmap-integration-testing-common/src/test/resources/eml/windows1252charset.eml
@@ -24,20 +24,20 @@ This is a multi-part message in MIME format.
 Content-Type: text/plain; charset=windows-1252; format=flowed
 Content-Transfer-Encoding: 8bit
 
-àààà
+àààà
 
-éééé
+éééé
 
-èèèè
+èèèè
 
 --------------4609A4E8FA0583CE89472864
 Content-Type: text/html; charset=windows-1252
 Content-Transfer-Encoding: 8bit
 
 <html>
-  <p>àààà</p>
-  <p>éééé</p>
-  <p>èèèè</p>
+  <p>àààà</p>
+  <p>éééé</p>
+  <p>èèèè</p>
 </html>
 
 --------------4609A4E8FA0583CE89472864--


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org
For additional commands, e-mail: server-dev-h...@james.apache.org

Reply via email to