This is an automated email from the ASF dual-hosted git repository. btellier pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/james-project.git
commit c9d4adf901ff1f575b5fcf3cbf4291c7cfca9e89 Author: Benoit Tellier <[email protected]> AuthorDate: Wed Mar 2 14:45:44 2022 +0700 JAMES-3719 Avoid copying mime part content when we are not to extract its content --- .../james/mailbox/extractor/TextExtractor.java | 3 ++ .../mailbox/elasticsearch/v7/json/MimePart.java | 52 +++++++++++----------- .../v7/json/MimePartContainerBuilder.java | 3 -- .../elasticsearch/v7/json/MimePartParser.java | 4 +- .../v7/json/RootMimePartContainerBuilder.java | 5 --- .../v7/json/IndexableMessageTest.java | 1 + .../elasticsearch/v7/json/MimePartTest.java | 4 +- .../store/extractor/DefaultTextExtractor.java | 6 ++- .../store/extractor/JsoupTextExtractor.java | 8 ++++ .../tika/ContentTypeFilteringTextExtractor.java | 5 +++ 10 files changed, 51 insertions(+), 40 deletions(-) diff --git a/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/TextExtractor.java b/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/TextExtractor.java index 2fdf7e555e..2822ee02e8 100644 --- a/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/TextExtractor.java +++ b/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/TextExtractor.java @@ -26,6 +26,9 @@ import org.apache.james.mailbox.model.ContentType; import reactor.core.publisher.Mono; public interface TextExtractor { + default boolean applicable(ContentType contentType) { + return true; + } ParsedContent extractContent(InputStream inputStream, ContentType contentType) throws Exception; diff --git a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePart.java b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePart.java index 918306fddf..8a286b057d 100644 --- a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePart.java +++ b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePart.java @@ -25,6 +25,7 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Optional; +import java.util.function.Predicate; import java.util.stream.Stream; import org.apache.commons.io.FilenameUtils; @@ -34,7 +35,6 @@ import org.apache.james.mailbox.extractor.TextExtractor; import org.apache.james.mailbox.model.ContentType; import org.apache.james.mailbox.model.ContentType.MediaType; import org.apache.james.mailbox.model.ContentType.SubType; -import org.apache.james.mailbox.store.extractor.DefaultTextExtractor; import org.apache.james.mime4j.stream.Field; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,7 +42,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.github.fge.lambdas.Throwing; -import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; @@ -62,9 +61,10 @@ public class MimePart { private Optional<String> fileExtension; private Optional<String> contentDisposition; private Optional<Charset> charset; - private TextExtractor textExtractor; + private Predicate<ContentType> shouldCaryOverContent; - private Builder() { + private Builder(Predicate<ContentType> shouldCaryOverContent) { + this.shouldCaryOverContent = shouldCaryOverContent; children = Lists.newArrayList(); headerCollectionBuilder = HeaderCollection.builder(); this.bodyContent = Optional.empty(); @@ -74,7 +74,6 @@ public class MimePart { this.fileExtension = Optional.empty(); this.contentDisposition = Optional.empty(); this.charset = Optional.empty(); - this.textExtractor = new DefaultTextExtractor(); } @Override @@ -120,27 +119,32 @@ public class MimePart { return this; } - @Override - public MimePartContainerBuilder using(TextExtractor textExtractor) { - Preconditions.checkArgument(textExtractor != null, "Provided text extractor should not be null"); - this.textExtractor = textExtractor; - return this; - } - @Override public MimePartContainerBuilder charset(Charset charset) { this.charset = Optional.of(charset); return this; } + private Optional<ContentType> computeContentType() { + if (mediaType.isPresent() && subType.isPresent()) { + return Optional.of(ContentType.of( + ContentType.MimeType.of(mediaType.get(), subType.get()), + charset)); + } else { + return Optional.empty(); + } + } + @Override public ParsedMimePart build() { + final Optional<ContentType> contentType = computeContentType(); return new ParsedMimePart( headerCollectionBuilder.build(), - bodyContent, + bodyContent.filter(any -> shouldCaryOverContent.test(contentType.orElse(null))), charset, mediaType, subType, + contentType, fileName, fileExtension, contentDisposition, @@ -154,6 +158,7 @@ public class MimePart { private final Optional<Charset> charset; private final Optional<MediaType> mediaType; private final Optional<SubType> subType; + private Optional<ContentType> contentType; private final Optional<String> fileName; private final Optional<String> fileExtension; private final Optional<String> contentDisposition; @@ -161,17 +166,19 @@ public class MimePart { public ParsedMimePart(HeaderCollection headerCollection, Optional<InputStream> bodyContent, Optional<Charset> charset, Optional<MediaType> mediaType, - Optional<SubType> subType, Optional<String> fileName, Optional<String> fileExtension, + Optional<SubType> subType, Optional<ContentType> contentType, Optional<String> fileName, Optional<String> fileExtension, Optional<String> contentDisposition, List<ParsedMimePart> attachments) { this.headerCollection = headerCollection; - this.bodyContent = bodyContent.map(Throwing.function(IOUtils::toByteArray)); this.mediaType = mediaType; this.subType = subType; + this.contentType = contentType; this.fileName = fileName; this.fileExtension = fileExtension; this.contentDisposition = contentDisposition; this.attachments = attachments; this.charset = charset; + + this.bodyContent = bodyContent.map(Throwing.function(IOUtils::toByteArray)); } public Mono<MimePart> asMimePart(TextExtractor textExtractor) { @@ -196,7 +203,7 @@ public class MimePart { if (shouldPerformTextExtraction()) { return textExtractor.extractContentReactive( new ByteArrayInputStream(bodyContent.get()), - computeContentType().orElse(null)); + contentType.orElse(null)); } return Mono.fromCallable(() -> new ParsedContent( Optional.ofNullable(IOUtils.toString(new ByteArrayInputStream(bodyContent.get()), charset.orElse(StandardCharsets.UTF_8))), @@ -215,19 +222,10 @@ public class MimePart { return isTextBody() && subType.map(SubType.of("html")::equals).orElse(false); } - private Optional<ContentType> computeContentType() { - if (mediaType.isPresent() && subType.isPresent()) { - return Optional.of(ContentType.of( - ContentType.MimeType.of(mediaType.get(), subType.get()), - charset)); - } else { - return Optional.empty(); - } - } } - public static Builder builder() { - return new Builder(); + public static Builder builder(Predicate<ContentType> shouldCaryOverContent) { + return new Builder(shouldCaryOverContent); } private static final Logger LOGGER = LoggerFactory.getLogger(MimePart.class); diff --git a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartContainerBuilder.java b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartContainerBuilder.java index d3d88c4f67..5e4fb1ee89 100644 --- a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartContainerBuilder.java +++ b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartContainerBuilder.java @@ -22,7 +22,6 @@ package org.apache.james.mailbox.elasticsearch.v7.json; import java.io.InputStream; import java.nio.charset.Charset; -import org.apache.james.mailbox.extractor.TextExtractor; import org.apache.james.mailbox.model.ContentType.MediaType; import org.apache.james.mailbox.model.ContentType.SubType; import org.apache.james.mime4j.stream.Field; @@ -31,8 +30,6 @@ public interface MimePartContainerBuilder { MimePart.ParsedMimePart build(); - MimePartContainerBuilder using(TextExtractor textExtractor); - MimePartContainerBuilder addToHeaders(Field field); MimePartContainerBuilder addBodyContent(InputStream bodyContent); diff --git a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartParser.java b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartParser.java index 81a1a15e8a..ef6c18f50f 100644 --- a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartParser.java +++ b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartParser.java @@ -78,7 +78,7 @@ public class MimePartParser { stackCurrent(); break; case T_START_HEADER: - currentlyBuildMimePart = MimePart.builder(); + currentlyBuildMimePart = MimePart.builder(textExtractor::applicable); break; case T_FIELD: currentlyBuildMimePart.addToHeaders(stream.getField()); @@ -107,7 +107,7 @@ public class MimePartParser { } private void closeMimePart() { - MimePart.ParsedMimePart bodyMimePart = currentlyBuildMimePart.using(textExtractor).build(); + MimePart.ParsedMimePart bodyMimePart = currentlyBuildMimePart.build(); if (!builderStack.isEmpty()) { builderStack.peek().addChild(bodyMimePart); } else { diff --git a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/RootMimePartContainerBuilder.java b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/RootMimePartContainerBuilder.java index 93755a6790..fc394205ca 100644 --- a/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/RootMimePartContainerBuilder.java +++ b/mailbox/elasticsearch-v7/src/main/java/org/apache/james/mailbox/elasticsearch/v7/json/RootMimePartContainerBuilder.java @@ -22,7 +22,6 @@ package org.apache.james.mailbox.elasticsearch.v7.json; import java.io.InputStream; import java.nio.charset.Charset; -import org.apache.james.mailbox.extractor.TextExtractor; import org.apache.james.mailbox.model.ContentType.MediaType; import org.apache.james.mailbox.model.ContentType.SubType; import org.apache.james.mime4j.stream.Field; @@ -40,10 +39,6 @@ public class RootMimePartContainerBuilder implements MimePartContainerBuilder { return rootMimePart; } - @Override public MimePartContainerBuilder using(TextExtractor textExtractor) { - return this; - } - @Override public MimePartContainerBuilder addToHeaders(Field field) { LOGGER.warn("Trying to add headers to the Root MimePart container"); diff --git a/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/IndexableMessageTest.java b/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/IndexableMessageTest.java index 62e4d85871..c51cdb4929 100644 --- a/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/IndexableMessageTest.java +++ b/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/IndexableMessageTest.java @@ -232,6 +232,7 @@ class IndexableMessageTest { .thenReturn(MESSAGE_UID); TextExtractor textExtractor = mock(TextExtractor.class); + when(textExtractor.applicable(any())).thenReturn(true); when(textExtractor.extractContentReactive(any(), any())) .thenReturn(Mono.just(new ParsedContent(Optional.of("first attachment content"), ImmutableMap.of()))) .thenReturn(Mono.error(new RuntimeException("second cannot be parsed"))) diff --git a/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartTest.java b/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartTest.java index f2afdd0793..072e8d493b 100644 --- a/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartTest.java +++ b/mailbox/elasticsearch-v7/src/test/java/org/apache/james/mailbox/elasticsearch/v7/json/MimePartTest.java @@ -32,7 +32,7 @@ class MimePartTest { @Test void buildShouldWorkWhenTextualContentFromParserIsEmpty() { - MimePart.builder() + MimePart.builder(contentType -> true) .addBodyContent(new ByteArrayInputStream(new byte[] {})) .addMediaType(MediaType.of("text")) .addSubType(SubType.of("plain")) @@ -42,7 +42,7 @@ class MimePartTest { @Test void buildShouldWorkWhenTextualContentFromParserIsNonEmpty() { String body = "text"; - MimePart mimePart = MimePart.builder() + MimePart mimePart = MimePart.builder(contentType -> true) .addBodyContent(new ByteArrayInputStream(body.getBytes(StandardCharsets.UTF_8))) .addMediaType(MediaType.of("text")) .addSubType(SubType.of("plain")) diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java index 65c03bb625..50cc8b68e3 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractor.java @@ -36,10 +36,14 @@ import org.apache.james.mailbox.model.ContentType; * Costs less calculations that TikaTextExtractor, but result is not that good. */ public class DefaultTextExtractor implements TextExtractor { + @Override + public boolean applicable(ContentType contentType) { + return contentType != null && contentType.asString().startsWith("text/"); + } @Override public ParsedContent extractContent(InputStream inputStream, ContentType contentType) throws Exception { - if (contentType != null && contentType.asString().startsWith("text/")) { + if (applicable(contentType)) { Charset charset = contentType.charset().orElse(StandardCharsets.UTF_8); return new ParsedContent(Optional.ofNullable(IOUtils.toString(inputStream, charset)), new HashMap<>()); } else { diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractor.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractor.java index 45378aefb8..b06f55ffc0 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractor.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/extractor/JsoupTextExtractor.java @@ -44,6 +44,14 @@ public class JsoupTextExtractor implements TextExtractor { private static final MimeType TEXT_HTML = MimeType.of("text/html"); private static final MimeType TEXT_PLAIN = MimeType.of("text/plain"); + @Override + public boolean applicable(ContentType contentType) { + if (contentType == null) { + return false; + } + return contentType.mimeType().equals(TEXT_HTML) || contentType.mimeType().equals(TEXT_PLAIN); + } + @Override public ParsedContent extractContent(InputStream inputStream, ContentType contentType) throws Exception { if (inputStream == null || contentType == null) { diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java index 63cf46ee3f..80538174de 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java @@ -40,6 +40,11 @@ public class ContentTypeFilteringTextExtractor implements TextExtractor { this.contentTypeBlacklist = contentTypeBlacklist; } + @Override + public boolean applicable(ContentType contentType) { + return !isBlacklisted(contentType.mimeType()); + } + @Override public ParsedContent extractContent(InputStream inputStream, ContentType contentType) throws Exception { if (isBlacklisted(contentType.mimeType())) { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
