This is an automated email from the ASF dual-hosted git repository. rcordier pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/james-project.git
commit 2958ee9f6f0d34f61f91950cb907156d7f39dc32
Author: Tran Tien Duc <[email protected]>
AuthorDate: Fri Feb 7 15:31:31 2020 +0700

    JAMES-3044 Test to prove JsoupTextExtractor fails on null characters
---
 .../store/extractor/JsoupTextExtractorTest.java    | 30 ++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
index 64bd9b9..2a8ec70 100644
--- a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
+++ b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.java
@@ -20,6 +20,7 @@
 package org.apache.james.mailbox.store.extractor;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatCode;
 
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
@@ -28,9 +29,13 @@ import java.nio.charset.StandardCharsets;
 import org.apache.james.mailbox.extractor.ParsedContent;
 import org.apache.james.mailbox.extractor.TextExtractor;
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 class JsoupTextExtractorTest {
+
+    private static final String TEXT_HTML_CONTENT_TYPE = "text/html";
+
     TextExtractor textExtractor;
 
     @BeforeEach
@@ -42,7 +47,7 @@ class JsoupTextExtractorTest {
     void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception {
         InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/html.txt");
 
-        assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent().get())
+        assertThat(textExtractor.extractContent(inputStream, TEXT_HTML_CONTENT_TYPE).getTextualContent().get())
             .doesNotContain("*|MC:SUBJECT|*");
     }
 
@@ -64,7 +69,7 @@ class JsoupTextExtractorTest {
 
     @Test
     void extractContentShouldReturnEmptyWhenNullData() throws Exception {
-        assertThat(textExtractor.extractContent(null, "text/html"))
+        assertThat(textExtractor.extractContent(null, TEXT_HTML_CONTENT_TYPE))
             .isEqualTo(ParsedContent.empty());
     }
 
@@ -76,4 +81,25 @@ class JsoupTextExtractorTest {
             .isEqualTo(ParsedContent.empty());
     }
 
+    @Disabled("JAMES-3044 java.io.IOException: Input is binary and unsupported")
+    @Test
+    void extractContentShouldNotThrowWhenContainingNullCharacters() {
+        InputStream inputStream = textContentWithManyNullCharacters();
+
+        assertThatCode(() -> textExtractor.extractContent(inputStream, TEXT_HTML_CONTENT_TYPE))
+            .doesNotThrowAnyException();
+    }
+
+    private InputStream textContentWithManyNullCharacters() {
+        String htmlTextContent = "HTML pages can include a lot of null '\0' character. But still expecting the content can be parsed." +
+            "Jsoup 1.21.1 thinks a file containing more than 10 null characters can be a binary file";
+        byte[] htmlBytesContent = htmlTextContent.getBytes(StandardCharsets.UTF_8);
+        byte[] nullCharacters = {'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'};
+
+        byte[] fullContent = new byte[htmlBytesContent.length + nullCharacters.length];
+        System.arraycopy(htmlBytesContent, 0, fullContent, 0, htmlBytesContent.length);
+        System.arraycopy(nullCharacters, 0, fullContent, htmlBytesContent.length, nullCharacters.length);
+
+        return new ByteArrayInputStream(fullContent);
+    }
 }
\ No newline at end of file

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
