JAMES-2137 Sanitize and test PDFTextExtractor
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/11e336a3 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/11e336a3 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/11e336a3 Branch: refs/heads/master Commit: 11e336a32c23e7dad43ef4d373d9df04c03c6935 Parents: ad33cef Author: benwa <btell...@linagora.com> Authored: Fri Sep 8 10:03:38 2017 +0700 Committer: Antoine Duprat <adup...@linagora.com> Committed: Fri Sep 8 21:56:59 2017 +0200 ---------------------------------------------------------------------- .../mailbox/store/search/PDFTextExtractor.java | 26 +++---- .../store/search/PDFTextExtractorTest.java | 75 +++++++++++++++++++ .../scanning-search/src/test/resources/pdf.pdf | Bin 0 -> 14707 bytes .../mailbox/store/search/MessageSearches.java | 21 ++++-- 4 files changed, 101 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java ---------------------------------------------------------------------- diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java index 1a5b5eb..1e21b7e 100644 --- a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java +++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractor.java @@ -28,36 +28,32 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import com.google.common.base.Charsets; +import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; public class PDFTextExtractor implements TextExtractor { - private static final String PDF_TYPE = "application/pdf"; + static final String PDF_TYPE = "application/pdf"; @Override public ParsedContent extractContent(InputStream inputStream, String contentType) throws Exception { + Preconditions.checkNotNull(inputStream); + Preconditions.checkNotNull(contentType); + if (isPDF(contentType)) { return extractTextFromPDF(inputStream); } - try { - return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), ImmutableMap.of()); - } catch (IOException e) { - return new ParsedContent(null, ImmutableMap.of()); - } + return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), ImmutableMap.of()); } private boolean isPDF(String contentType) { return contentType.equals(PDF_TYPE); } - private ParsedContent extractTextFromPDF(InputStream inputStream) { - try { - return new ParsedContent( - new PDFTextStripper().getText( - PDDocument.load(inputStream)), - ImmutableMap.of()); - } catch (IOException e) { - return new ParsedContent(null, ImmutableMap.of()); - } + private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException { + return new ParsedContent( + new PDFTextStripper().getText( + PDDocument.load(inputStream)), + ImmutableMap.of()); } } http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java new file mode 100644 index 0000000..df52009 --- /dev/null +++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java @@ -0,0 +1,75 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.store.search; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + +import org.junit.Before; +import org.junit.Test; + +public class PDFTextExtractorTest { + + private PDFTextExtractor testee; + + @Before + public void setUp() { + testee = new PDFTextExtractor(); + } + + @Test + public void extractContentShouldThrowWhenNullInputStream() throws Exception { + assertThatThrownBy(() -> + testee.extractContent(null, "any/any")) + .isInstanceOf(NullPointerException.class); + } + + @Test + public void extractContentShouldThrowWhenNullContentType() throws Exception { + InputStream inputStream = new ByteArrayInputStream("content".getBytes(StandardCharsets.UTF_8)); + assertThatThrownBy(() -> testee.extractContent(inputStream, null)) + .isInstanceOf(NullPointerException.class); + } + + @Test + public void extractContentShouldExtractPlainText() throws Exception { + String content = "content"; + InputStream inputStream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)); + + assertThat(testee.extractContent(inputStream, "text/plain") + .getTextualContent()) + .isEqualTo(content); + } + + @Test + public void extractContentShouldExtractPDF() throws Exception { + String content = "Little PDF"; + InputStream inputStream = ClassLoader.getSystemResourceAsStream("pdf.pdf"); + + assertThat(testee.extractContent(inputStream, PDFTextExtractor.PDF_TYPE) + .getTextualContent()) + .contains(content); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/scanning-search/src/test/resources/pdf.pdf ---------------------------------------------------------------------- diff --git a/mailbox/scanning-search/src/test/resources/pdf.pdf b/mailbox/scanning-search/src/test/resources/pdf.pdf new file mode 100644 index 0000000..5388d4a Binary files /dev/null and b/mailbox/scanning-search/src/test/resources/pdf.pdf differ http://git-wip-us.apache.org/repos/asf/james-project/blob/11e336a3/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java ---------------------------------------------------------------------- diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java index d34c522..dc35559 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java @@ -35,13 +35,13 @@ import java.util.List; import java.util.Locale; import java.util.Optional; import java.util.TimeZone; +import java.util.stream.Stream; import javax.mail.Flags; import org.apache.james.mailbox.MessageUid; import org.apache.james.mailbox.exception.MailboxException; import org.apache.james.mailbox.exception.UnsupportedSearchException; -import org.apache.james.mailbox.extractor.ParsedContent; import org.apache.james.mailbox.extractor.TextExtractor; import org.apache.james.mailbox.model.Attachment; import org.apache.james.mailbox.model.MessageAttachment; @@ -77,9 +77,7 @@ import org.apache.james.mime4j.utils.search.MessageMatcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.github.fge.lambdas.Throwing; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; /** @@ -252,12 +250,23 @@ public class MessageSearches implements Iterable<SimpleMessageSearchIndex.Search private boolean isInAttachments(String value, List<MessageAttachment> attachments) { return attachments.stream() .map(MessageAttachment::getAttachment) - .map(Throwing.function((Attachment attachment) -> textExtractor.extractContent(attachment.getStream(), attachment.getType())) - .orReturn(new ParsedContent(null, ImmutableMap.of()))) - .map(ParsedContent::getTextualContent) + .flatMap(this::toAttachmentContent) .anyMatch(string -> string.contains(value)); } + private Stream<String> toAttachmentContent(Attachment attachment) { + try { + return Stream.of(textExtractor + .extractContent( + attachment.getStream(), + attachment.getType()) + .getTextualContent()); + } catch (Exception e) { + LOGGER.error("Error while parsing attachment content", e); + return Stream.of(); + } + } + private InputStream textHeaders(MailboxMessage message) throws MimeIOException, IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); new DefaultMessageWriter() --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org