[james-project] 04/09: JAMES-3044 Test to prove JsoupTextExtractor fails on null characters

rcordier Tue, 11 Feb 2020 20:15:27 -0800

This is an automated email from the ASF dual-hosted git repository.

rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git


commit 2958ee9f6f0d34f61f91950cb907156d7f39dc32
Author: Tran Tien Duc <[email protected]>
AuthorDate: Fri Feb 7 15:31:31 2020 +0700

    JAMES-3044 Test to prove JsoupTextExtractor fails on null characters
---
 .../store/extractor/JsoupTextExtractorTest.java    | 30 ++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
index 64bd9b9..2a8ec70 100644
--- a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
+++ b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
@@ -20,6 +20,7 @@
 package org.apache.james.mailbox.store.extractor;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatCode;
 
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
@@ -28,9 +29,13 @@ import java.nio.charset.StandardCharsets;
 import org.apache.james.mailbox.extractor.ParsedContent;
 import org.apache.james.mailbox.extractor.TextExtractor;
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 class JsoupTextExtractorTest {
+
+    private static final String TEXT_HTML_CONTENT_TYPE = "text/html";
+
     TextExtractor textExtractor;
 
     @BeforeEach
@@ -42,7 +47,7 @@ class JsoupTextExtractorTest {
     void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception {
         InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/html.txt");
 
-        assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent().get())
+        assertThat(textExtractor.extractContent(inputStream, TEXT_HTML_CONTENT_TYPE).getTextualContent().get())
                 .doesNotContain("*|MC:SUBJECT|*");
     }
 
@@ -64,7 +69,7 @@ class JsoupTextExtractorTest {
 
     @Test
     void extractContentShouldReturnEmptyWhenNullData() throws Exception {
-        assertThat(textExtractor.extractContent(null, "text/html"))
+        assertThat(textExtractor.extractContent(null, TEXT_HTML_CONTENT_TYPE))
             .isEqualTo(ParsedContent.empty());
     }
 
@@ -76,4 +81,25 @@ class JsoupTextExtractorTest {
             .isEqualTo(ParsedContent.empty());
     }
 
+    @Disabled("JAMES-3044 java.io.IOException: Input is binary and unsupported")
+    @Test
+    void extractContentShouldNotThrowWhenContainingNullCharacters() {
+        InputStream inputStream = textContentWithManyNullCharacters();
+
+        assertThatCode(() -> textExtractor.extractContent(inputStream, TEXT_HTML_CONTENT_TYPE))
+            .doesNotThrowAnyException();
+    }
+
+    private InputStream textContentWithManyNullCharacters() {
+        String htmlTextContent = "HTML pages can include a lot of null '\0' character. But still expecting the content can be parsed." +
+            "Jsoup 1.21.1 thinks a file containing more than 10 null characters can be a binary file";
+        byte[] htmlBytesContent = htmlTextContent.getBytes(StandardCharsets.UTF_8);
+        byte[] nullCharacters = {'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'};
+
+        byte[] fullContent = new byte[htmlBytesContent.length + nullCharacters.length];
+        System.arraycopy(htmlBytesContent, 0, fullContent, 0, htmlBytesContent.length);
+        System.arraycopy(nullCharacters, 0, fullContent, htmlBytesContent.length, nullCharacters.length);
+
+        return new ByteArrayInputStream(fullContent);
+    }
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[james-project] 04/09: JAMES-3044 Test to prove JsoupTextExtractor fails on null characters

Reply via email to