JAMES-2581 Use the contentType blacklist from the Tika configuration in the Text Extractor
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/501d3436 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/501d3436 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/501d3436 Branch: refs/heads/master Commit: 501d3436e224a2d38a783425dc8117f200b8c656 Parents: 51f1d7b Author: datph <dphamho...@linagora.com> Authored: Tue Nov 6 18:34:00 2018 +0700 Committer: datph <dphamho...@linagora.com> Committed: Fri Nov 9 16:28:02 2018 +0700 ---------------------------------------------------------------------- .../tika/ContentTypeFilteringTextExtractor.java | 14 ++- .../tika/TextExtractorConfiguration.java | 94 -------------------- .../james/mailbox/tika/TikaConfiguration.java | 16 ++-- .../ContentTypeFilteringTextExtractorTest.java | 10 +-- .../tika/TextExtractorConfigurationTest.java | 60 ------------- .../mailbox/TikaConfigurationReader.java | 10 ++- .../modules/mailbox/TikaMailboxModule.java | 26 +----- .../mailbox/TikaConfigurationReaderTest.java | 8 +- 8 files changed, 30 insertions(+), 208 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java index 9b18a2f..c648610 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java @@ -20,19 +20,20 @@ package org.apache.james.mailbox.tika; import java.io.InputStream; -import java.util.Objects; import 
org.apache.james.mailbox.extractor.ParsedContent; import org.apache.james.mailbox.extractor.TextExtractor; +import com.google.common.collect.ImmutableSet; + public class ContentTypeFilteringTextExtractor implements TextExtractor { private final TextExtractor textExtractor; - private final TextExtractorConfiguration textExtractorConfiguration; + private final ImmutableSet<String> contentTypeBlacklist; - public ContentTypeFilteringTextExtractor(TextExtractor textExtractor, TextExtractorConfiguration textExtractorConfiguration) { + public ContentTypeFilteringTextExtractor(TextExtractor textExtractor, ImmutableSet<String> contentTypeBlacklist) { this.textExtractor = textExtractor; - this.textExtractorConfiguration = textExtractorConfiguration; + this.contentTypeBlacklist = contentTypeBlacklist; } @Override @@ -44,10 +45,7 @@ public class ContentTypeFilteringTextExtractor implements TextExtractor { } private boolean isBlacklisted(String contentType) { - return textExtractorConfiguration - .getContentTypeBlacklist() - .stream() - .anyMatch(blackListItem -> Objects.equals(blackListItem, contentType)); + return contentTypeBlacklist.contains(contentType); } } http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java deleted file mode 100644 index df5400a..0000000 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java +++ /dev/null @@ -1,94 +0,0 @@ -/**************************************************************** - * Licensed to the Apache Software Foundation (ASF) under one * - * or more contributor license agreements. 
See the NOTICE file * - * distributed with this work for additional information * - * regarding copyright ownership. The ASF licenses this file * - * to you under the Apache License, Version 2.0 (the * - * "License"); you may not use this file except in compliance * - * with the License. You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, * - * software distributed under the License is distributed on an * - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * - * KIND, either express or implied. See the License for the * - * specific language governing permissions and limitations * - * under the License. * - ****************************************************************/ - -package org.apache.james.mailbox.tika; - -import java.util.List; -import java.util.Objects; - -import org.apache.commons.configuration.AbstractConfiguration; -import org.apache.commons.configuration.Configuration; -import org.apache.james.util.StreamUtils; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; - -public class TextExtractorConfiguration { - - private static final String TEXT_EXTRACTOR_CONTENT_TYPE_BLACKLIST = "textextractor.contentType.blacklist"; - - public static class Builder { - private ImmutableList.Builder<String> contentTypeBlacklist; - - public Builder contentTypeBlacklist(List<String> contentTypeBlacklist) { - Preconditions.checkNotNull(contentTypeBlacklist); - this.contentTypeBlacklist.addAll(contentTypeBlacklist); - return this; - } - - private Builder() { - contentTypeBlacklist = ImmutableList.builder(); - } - - public TextExtractorConfiguration build() { - return new TextExtractorConfiguration(contentTypeBlacklist.build()); - } - } - - public static Builder builder() { - return new Builder(); - } - - private final ImmutableList<String> contentTypeBlacklist; - - public 
TextExtractorConfiguration(ImmutableList<String> contentTypeBlacklist) { - this.contentTypeBlacklist = contentTypeBlacklist; - } - - public ImmutableList<String> getContentTypeBlacklist() { - return contentTypeBlacklist; - } - - public static TextExtractorConfiguration readTextExtractorConfiguration(Configuration configuration) { - AbstractConfiguration.setDefaultListDelimiter(','); - - List<String> contentTypeBlacklist = StreamUtils - .ofNullable(configuration.getStringArray(TEXT_EXTRACTOR_CONTENT_TYPE_BLACKLIST)) - .map(String::trim) - .collect(ImmutableList.toImmutableList()); - - return TextExtractorConfiguration.builder() - .contentTypeBlacklist(contentTypeBlacklist) - .build(); - } - - @Override - public final boolean equals(Object o) { - if (o instanceof TextExtractorConfiguration) { - TextExtractorConfiguration that = (TextExtractorConfiguration) o; - return Objects.equals(this.contentTypeBlacklist, that.contentTypeBlacklist); - } - return false; - } - - @Override - public final int hashCode() { - return Objects.hash(contentTypeBlacklist); - } -} http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaConfiguration.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaConfiguration.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaConfiguration.java index ce978ab..e95a8e8 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaConfiguration.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaConfiguration.java @@ -20,15 +20,15 @@ package org.apache.james.mailbox.tika; import java.time.Duration; -import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.james.util.Port; import com.google.common.base.Preconditions; -import 
com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; import com.google.common.primitives.Ints; public class TikaConfiguration { @@ -41,7 +41,7 @@ public class TikaConfiguration { private Optional<Integer> timeoutInMillis; private Optional<Duration> cacheEvictionPeriod; private Optional<Long> cacheWeightInBytes; - private ImmutableList.Builder<String> contentTypeBlacklist; + private ImmutableSet.Builder<String> contentTypeBlacklist; private Builder() { isEnabled = Optional.empty(); @@ -51,7 +51,7 @@ public class TikaConfiguration { timeoutInMillis = Optional.empty(); cacheEvictionPeriod = Optional.empty(); cacheWeightInBytes = Optional.empty(); - contentTypeBlacklist = ImmutableList.builder(); + contentTypeBlacklist = ImmutableSet.builder(); } public Builder enable(Optional<Boolean> isEnabled) { @@ -140,7 +140,7 @@ public class TikaConfiguration { return this; } - public Builder contentTypeBlacklist(List<String> contentTypeBlacklist) { + public Builder contentTypeBlacklist(Set<String> contentTypeBlacklist) { Preconditions.checkNotNull(contentTypeBlacklist); this.contentTypeBlacklist.addAll(contentTypeBlacklist); return this; @@ -179,9 +179,9 @@ public class TikaConfiguration { private final int timeoutInMillis; private final Duration cacheEvictionPeriod; private final long cacheWeightInBytes; - private final ImmutableList<String> contentTypeBlacklist; + private final ImmutableSet<String> contentTypeBlacklist; - private TikaConfiguration(boolean enabled, boolean cacheEnabled, String host, int port, int timeoutInMillis, Duration cacheEvictionPeriod, long cacheWeightInBytes, ImmutableList<String> contentTypeBlacklist) { + private TikaConfiguration(boolean enabled, boolean cacheEnabled, String host, int port, int timeoutInMillis, Duration cacheEvictionPeriod, long cacheWeightInBytes, ImmutableSet<String> contentTypeBlacklist) { this.enabled = enabled; this.cacheEnabled = cacheEnabled; this.host = host; @@ -220,7 +220,7 @@ public class 
TikaConfiguration { return cacheWeightInBytes; } - public List<String> getContentTypeBlacklist() { + public ImmutableSet<String> getContentTypeBlacklist() { return contentTypeBlacklist; } http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java index 9f9c2cd..194d621 100644 --- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java @@ -35,7 +35,7 @@ import org.junit.Test; import org.mockito.Mock; import org.mockito.MockitoAnnotations; -import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; public class ContentTypeFilteringTextExtractorTest { @@ -51,9 +51,7 @@ public class ContentTypeFilteringTextExtractorTest { public void extractContentReturnEmptyWithContentTypeInBlacklist() throws Exception { ContentTypeFilteringTextExtractor contentTypeFilteringTextExtractor = new ContentTypeFilteringTextExtractor(textExtractor, - TextExtractorConfiguration.builder() - .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) - .build()); + ImmutableSet.of("application/ics", "application/zip")); assertThat(contentTypeFilteringTextExtractor .extractContent(IOUtils.toInputStream(""), "application/ics")) @@ -66,9 +64,7 @@ public class ContentTypeFilteringTextExtractorTest { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/Text.txt"); ContentTypeFilteringTextExtractor contentTypeFilteringTextExtractor = new ContentTypeFilteringTextExtractor(textExtractor, - 
TextExtractorConfiguration.builder() - .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) - .build()); + ImmutableSet.of("application/ics", "application/zip")); contentTypeFilteringTextExtractor.extractContent(inputStream, "text/plain"); verify(textExtractor, times(1)).extractContent(any(), any()); http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java deleted file mode 100644 index 15c0135..0000000 --- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java +++ /dev/null @@ -1,60 +0,0 @@ -/**************************************************************** - * Licensed to the Apache Software Foundation (ASF) under one * - * or more contributor license agreements. See the NOTICE file * - * distributed with this work for additional information * - * regarding copyright ownership. The ASF licenses this file * - * to you under the Apache License, Version 2.0 (the * - * "License"); you may not use this file except in compliance * - * with the License. You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, * - * software distributed under the License is distributed on an * - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * - * KIND, either express or implied. See the License for the * - * specific language governing permissions and limitations * - * under the License. 
* - ****************************************************************/ - -package org.apache.james.mailbox.tika; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.StringReader; - -import org.apache.commons.configuration.ConfigurationException; -import org.apache.commons.configuration.PropertiesConfiguration; -import org.junit.Test; - -import com.google.common.collect.ImmutableList; - -import nl.jqno.equalsverifier.EqualsVerifier; - -public class TextExtractorConfigurationTest { - - @Test - public void shouldMatchBeanContract() { - EqualsVerifier.forClass(TextExtractorConfiguration.class) - .verify(); - } - - @Test - public void readTextExtractorConfigurationReturnEmptyWithNoBlacklist() { - PropertiesConfiguration configuration = new PropertiesConfiguration(); - - assertThat(TextExtractorConfiguration.readTextExtractorConfiguration(configuration)) - .isEqualTo(new TextExtractorConfiguration(ImmutableList.of())); - } - - @Test - public void readTextExtractorConfigurationReturnConfigurationWithBlacklist() throws ConfigurationException { - PropertiesConfiguration configuration = new PropertiesConfiguration(); - configuration.load(new StringReader("textextractor.contentType.blacklist=application/ics, application/zip")); - - assertThat(TextExtractorConfiguration.readTextExtractorConfiguration(configuration)) - .isEqualTo(TextExtractorConfiguration.builder() - .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) - .build()); - } -} http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaConfigurationReader.java ---------------------------------------------------------------------- diff --git a/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaConfigurationReader.java b/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaConfigurationReader.java 
index 2ab5240..fa7438a 100644 --- a/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaConfigurationReader.java +++ b/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaConfigurationReader.java @@ -20,9 +20,10 @@ package org.apache.james.modules.mailbox; import java.time.Duration; -import java.util.List; import java.util.Optional; +import java.util.Set; +import org.apache.commons.configuration.AbstractConfiguration; import org.apache.commons.configuration.Configuration; import org.apache.james.mailbox.tika.TikaConfiguration; import org.apache.james.util.Size; @@ -30,7 +31,7 @@ import org.apache.james.util.StreamUtils; import org.apache.james.util.TimeConverter; import com.github.fge.lambdas.Throwing; -import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; public class TikaConfigurationReader { public static final String TIKA_ENABLED = "tika.enabled"; @@ -43,6 +44,7 @@ public class TikaConfigurationReader { public static final String TIKA_CONTENT_TYPE_BLACKLIST = "tika.contentType.blacklist"; public static TikaConfiguration readTikaConfiguration(Configuration configuration) { + AbstractConfiguration.setDefaultListDelimiter(','); Optional<Boolean> enabled = Optional.ofNullable( configuration.getBoolean(TIKA_ENABLED, null)); @@ -69,10 +71,10 @@ public class TikaConfigurationReader { .map(Throwing.function(Size::parse)) .map(Size::asBytes); - List<String> contentTypeBlacklist = StreamUtils + Set<String> contentTypeBlacklist = StreamUtils .ofNullable(configuration.getStringArray(TIKA_CONTENT_TYPE_BLACKLIST)) .map(String::trim) - .collect(ImmutableList.toImmutableList()); + .collect(ImmutableSet.toImmutableSet()); return TikaConfiguration.builder() .enable(enabled) http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java 
---------------------------------------------------------------------- diff --git a/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java b/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java index 9bea1f2..7db0d20 100644 --- a/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java +++ b/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java @@ -30,7 +30,6 @@ import org.apache.james.mailbox.extractor.TextExtractor; import org.apache.james.mailbox.store.extractor.DefaultTextExtractor; import org.apache.james.mailbox.tika.CachingTextExtractor; import org.apache.james.mailbox.tika.ContentTypeFilteringTextExtractor; -import org.apache.james.mailbox.tika.TextExtractorConfiguration; import org.apache.james.mailbox.tika.TikaConfiguration; import org.apache.james.mailbox.tika.TikaHttpClient; import org.apache.james.mailbox.tika.TikaHttpClientImpl; @@ -41,7 +40,6 @@ import org.apache.james.utils.PropertiesProvider; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.ImmutableList; import com.google.inject.AbstractModule; import com.google.inject.Provides; import com.google.inject.Scopes; @@ -51,8 +49,6 @@ public class TikaMailboxModule extends AbstractModule { private static final Logger LOGGER = LoggerFactory.getLogger(TikaMailboxModule.class); private static final String TIKA_CONFIGURATION_NAME = "tika"; - private static final String TEXT_EXTRACTOR_NAME = "text_extractor"; - @Override protected void configure() { @@ -61,21 +57,6 @@ public class TikaMailboxModule extends AbstractModule { @Provides @Singleton - private TextExtractorConfiguration getTextExtractorConfiguration(PropertiesProvider propertiesProvider) throws ConfigurationException { - try { - Configuration configuration = 
propertiesProvider.getConfiguration(TEXT_EXTRACTOR_NAME); - - return TextExtractorConfiguration.readTextExtractorConfiguration(configuration); - } catch (FileNotFoundException e) { - LOGGER.warn("Could not find {} configuration file.", TEXT_EXTRACTOR_NAME); - return TextExtractorConfiguration.builder() - .contentTypeBlacklist(ImmutableList.of()) - .build(); - } - } - - @Provides - @Singleton protected TikaHttpClient provideTikaHttpClient(TikaConfiguration tikaConfiguration) throws URISyntaxException { return new TikaHttpClientImpl(tikaConfiguration); } @@ -97,8 +78,7 @@ public class TikaMailboxModule extends AbstractModule { @Provides @Singleton - private TextExtractor provideTextExtractor(TextExtractorConfiguration textExtractorConfiguration, - TikaTextExtractor textExtractor, TikaConfiguration configuration, + private TextExtractor provideTextExtractor(TikaTextExtractor textExtractor, TikaConfiguration configuration, MetricFactory metricFactory, GaugeRegistry gaugeRegistry) { if (configuration.isEnabled() && configuration.isCacheEnabled()) { LOGGER.info("Tika cache has been enabled."); @@ -108,10 +88,10 @@ public class TikaMailboxModule extends AbstractModule { configuration.getCacheEvictionPeriod(), configuration.getCacheWeightInBytes(), metricFactory, - gaugeRegistry), textExtractorConfiguration); + gaugeRegistry), configuration.getContentTypeBlacklist()); } if (configuration.isEnabled()) { - return new ContentTypeFilteringTextExtractor(textExtractor, textExtractorConfiguration); + return new ContentTypeFilteringTextExtractor(textExtractor, configuration.getContentTypeBlacklist()); } LOGGER.info("Tika text extraction has been disabled." + " Using DefaultTextExtractor instead. 
" + http://git-wip-us.apache.org/repos/asf/james-project/blob/501d3436/server/container/guice/cassandra-guice/src/test/java/org/apache/james/modules/mailbox/TikaConfigurationReaderTest.java ---------------------------------------------------------------------- diff --git a/server/container/guice/cassandra-guice/src/test/java/org/apache/james/modules/mailbox/TikaConfigurationReaderTest.java b/server/container/guice/cassandra-guice/src/test/java/org/apache/james/modules/mailbox/TikaConfigurationReaderTest.java index ab37be3..3ab5fe2 100644 --- a/server/container/guice/cassandra-guice/src/test/java/org/apache/james/modules/mailbox/TikaConfigurationReaderTest.java +++ b/server/container/guice/cassandra-guice/src/test/java/org/apache/james/modules/mailbox/TikaConfigurationReaderTest.java @@ -29,7 +29,7 @@ import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.james.mailbox.tika.TikaConfiguration; import org.junit.Test; -import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; public class TikaConfigurationReaderTest { @@ -247,7 +247,7 @@ public class TikaConfigurationReaderTest { .port(889) .timeoutInMillis(500) .cacheWeightInBytes(1520000) - .contentTypeBlacklist(ImmutableList.of()) + .contentTypeBlacklist(ImmutableSet.of()) .build()); } @@ -272,7 +272,7 @@ public class TikaConfigurationReaderTest { .port(889) .timeoutInMillis(500) .cacheWeightInBytes(1520000) - .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) + .contentTypeBlacklist(ImmutableSet.of("application/ics", "application/zip")) .build()); } @@ -297,7 +297,7 @@ public class TikaConfigurationReaderTest { .port(889) .timeoutInMillis(500) .cacheWeightInBytes(1520000) - .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) + .contentTypeBlacklist(ImmutableSet.of("application/ics", "application/zip")) .build()); } } \ No newline at end of file 
--------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org