Repository: james-project Updated Branches: refs/heads/master 16eebbbb6 -> cf2febf0c
JAMES-2581 Add contentType blacklist for text extractor Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/d91fe698 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/d91fe698 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/d91fe698 Branch: refs/heads/master Commit: d91fe6987fb6d1e7ff48c01a7d584bfb7c191dc8 Parents: 16eebbb Author: datph <dphamho...@linagora.com> Authored: Tue Nov 6 11:34:01 2018 +0700 Committer: datph <dphamho...@linagora.com> Committed: Fri Nov 9 16:28:01 2018 +0700 ---------------------------------------------------------------------- .../james/mailbox/extractor/ParsedContent.java | 24 ++++- .../mailbox/extractor/ParsedContentTest.java | 33 +++++++ mailbox/tika/pom.xml | 5 ++ .../tika/ContentTypeFilteringTextExtractor.java | 53 +++++++++++ .../tika/TextExtractorConfiguration.java | 94 ++++++++++++++++++++ .../ContentTypeFilteringTextExtractorTest.java | 76 ++++++++++++++++ .../tika/TextExtractorConfigurationTest.java | 60 +++++++++++++ .../modules/mailbox/TikaMailboxModule.java | 37 ++++++-- 8 files changed, 373 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java ---------------------------------------------------------------------- diff --git a/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java b/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java index 8ba90ab..4be6646 100644 --- a/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java +++ b/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java @@ -21,8 +21,11 @@ package org.apache.james.mailbox.extractor; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; +import com.google.common.collect.ImmutableMap; + public class ParsedContent { private final Optional<String> textualContent; @@ -40,5 +43,24 @@ public class ParsedContent { public Map<String, List<String>> getMetadata() { return metadata; } - + + public static ParsedContent empty() { + return new ParsedContent(Optional.empty(), ImmutableMap.of()); + } + + @Override + public final boolean equals(Object o) { + if (o instanceof ParsedContent) { + ParsedContent that = (ParsedContent) o; + + return Objects.equals(this.textualContent, that.textualContent) + && Objects.equals(this.metadata, that.metadata); + } + return false; + } + + @Override + public final int hashCode() { + return Objects.hash(textualContent, metadata); + } } http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/api/src/test/java/org/apache/james/mailbox/extractor/ParsedContentTest.java ---------------------------------------------------------------------- diff --git a/mailbox/api/src/test/java/org/apache/james/mailbox/extractor/ParsedContentTest.java b/mailbox/api/src/test/java/org/apache/james/mailbox/extractor/ParsedContentTest.java new file mode 100644 index 0000000..be65e10 --- /dev/null +++ b/mailbox/api/src/test/java/org/apache/james/mailbox/extractor/ParsedContentTest.java @@ -0,0 +1,33 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.extractor; + +import org.junit.Test; + +import nl.jqno.equalsverifier.EqualsVerifier; + +public class ParsedContentTest { + + @Test + public void shouldMatchBeanContract() { + EqualsVerifier.forClass(ParsedContent.class) + .verify(); + } +} http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/tika/pom.xml ---------------------------------------------------------------------- diff --git a/mailbox/tika/pom.xml b/mailbox/tika/pom.xml index 4564b56..5e53ecc 100644 --- a/mailbox/tika/pom.xml +++ b/mailbox/tika/pom.xml @@ -71,6 +71,11 @@ <artifactId>commons-configuration</artifactId> </dependency> <dependency> + <groupId>nl.jqno.equalsverifier</groupId> + <artifactId>equalsverifier</artifactId> + <scope>test</scope> + </dependency> + <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> </dependency> http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java new file mode 100644 index 0000000..9b18a2f --- /dev/null +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractor.java @@ -0,0 +1,53 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.tika; + +import java.io.InputStream; +import java.util.Objects; + +import org.apache.james.mailbox.extractor.ParsedContent; +import org.apache.james.mailbox.extractor.TextExtractor; + +public class ContentTypeFilteringTextExtractor implements TextExtractor { + + private final TextExtractor textExtractor; + private final TextExtractorConfiguration textExtractorConfiguration; + + public ContentTypeFilteringTextExtractor(TextExtractor textExtractor, TextExtractorConfiguration textExtractorConfiguration) { + this.textExtractor = textExtractor; + this.textExtractorConfiguration = textExtractorConfiguration; + } + + @Override + public ParsedContent extractContent(InputStream inputStream, String contentType) throws Exception { + if (isBlacklisted(contentType)) { + return ParsedContent.empty(); + } + return textExtractor.extractContent(inputStream, contentType); + } + + private boolean isBlacklisted(String contentType) { + return textExtractorConfiguration + .getContentTypeBlacklist() + .stream() + .anyMatch(blackListItem -> Objects.equals(blackListItem, contentType)); + } + +} http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java new file mode 100644 index 0000000..df5400a --- /dev/null +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TextExtractorConfiguration.java @@ -0,0 +1,94 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.tika; + +import java.util.List; +import java.util.Objects; + +import org.apache.commons.configuration.AbstractConfiguration; +import org.apache.commons.configuration.Configuration; +import org.apache.james.util.StreamUtils; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +public class TextExtractorConfiguration { + + private static final String TEXT_EXTRACTOR_CONTENT_TYPE_BLACKLIST = "textextractor.contentType.blacklist"; + + public static class Builder { + private ImmutableList.Builder<String> contentTypeBlacklist; + + public Builder contentTypeBlacklist(List<String> contentTypeBlacklist) { + Preconditions.checkNotNull(contentTypeBlacklist); + this.contentTypeBlacklist.addAll(contentTypeBlacklist); + return this; + } + + private Builder() { + contentTypeBlacklist = ImmutableList.builder(); + } + + public TextExtractorConfiguration build() { + return new TextExtractorConfiguration(contentTypeBlacklist.build()); + } + } + + public static Builder builder() { + return new Builder(); + } + + private final ImmutableList<String> contentTypeBlacklist; + + public TextExtractorConfiguration(ImmutableList<String> contentTypeBlacklist) { + this.contentTypeBlacklist = contentTypeBlacklist; + } + + public ImmutableList<String> getContentTypeBlacklist() { + return contentTypeBlacklist; + } + + public static TextExtractorConfiguration readTextExtractorConfiguration(Configuration configuration) { + AbstractConfiguration.setDefaultListDelimiter(','); + + List<String> contentTypeBlacklist = StreamUtils + .ofNullable(configuration.getStringArray(TEXT_EXTRACTOR_CONTENT_TYPE_BLACKLIST)) + .map(String::trim) + .collect(ImmutableList.toImmutableList()); + + return TextExtractorConfiguration.builder() + .contentTypeBlacklist(contentTypeBlacklist) + .build(); + } + + @Override + public final boolean equals(Object o) { + if (o instanceof TextExtractorConfiguration) { + TextExtractorConfiguration that = (TextExtractorConfiguration) o; + return Objects.equals(this.contentTypeBlacklist, that.contentTypeBlacklist); + } + return false; + } + + @Override + public final int hashCode() { + return Objects.hash(contentTypeBlacklist); + } +} http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java new file mode 100644 index 0000000..9f9c2cd --- /dev/null +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/ContentTypeFilteringTextExtractorTest.java @@ -0,0 +1,76 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.tika; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.james.mailbox.extractor.ParsedContent; +import org.apache.james.mailbox.extractor.TextExtractor; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import com.google.common.collect.ImmutableList; + +public class ContentTypeFilteringTextExtractorTest { + + @Mock + TextExtractor textExtractor; + + @Before + public void setUp() { + MockitoAnnotations.initMocks(this); + } + + @Test + public void extractContentReturnEmptyWithContentTypeInBlacklist() throws Exception { + ContentTypeFilteringTextExtractor contentTypeFilteringTextExtractor = + new ContentTypeFilteringTextExtractor(textExtractor, + TextExtractorConfiguration.builder() + .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) + .build()); + + assertThat(contentTypeFilteringTextExtractor + .extractContent(IOUtils.toInputStream(""), "application/ics")) + .isEqualTo(ParsedContent.empty()); + verifyNoMoreInteractions(textExtractor); + } + + @Test + public void extractContentCallUnderlyingWithContentTypeNotInBlacklist() throws Exception { + InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/Text.txt"); + ContentTypeFilteringTextExtractor contentTypeFilteringTextExtractor = + new ContentTypeFilteringTextExtractor(textExtractor, + TextExtractorConfiguration.builder() + .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) + .build()); + contentTypeFilteringTextExtractor.extractContent(inputStream, "text/plain"); + + verify(textExtractor, times(1)).extractContent(any(), any()); + } +} http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java new file mode 100644 index 0000000..15c0135 --- /dev/null +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TextExtractorConfigurationTest.java @@ -0,0 +1,60 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.tika; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.StringReader; + +import org.apache.commons.configuration.ConfigurationException; +import org.apache.commons.configuration.PropertiesConfiguration; +import org.junit.Test; + +import com.google.common.collect.ImmutableList; + +import nl.jqno.equalsverifier.EqualsVerifier; + +public class TextExtractorConfigurationTest { + + @Test + public void shouldMatchBeanContract() { + EqualsVerifier.forClass(TextExtractorConfiguration.class) + .verify(); + } + + @Test + public void readTextExtractorConfigurationReturnEmptyWithNoBlacklist() { + PropertiesConfiguration configuration = new PropertiesConfiguration(); + + assertThat(TextExtractorConfiguration.readTextExtractorConfiguration(configuration)) + .isEqualTo(new TextExtractorConfiguration(ImmutableList.of())); + } + + @Test + public void readTextExtractorConfigurationReturnConfigurationWithBlacklist() throws ConfigurationException { + PropertiesConfiguration configuration = new PropertiesConfiguration(); + configuration.load(new StringReader("textextractor.contentType.blacklist=application/ics, application/zip")); + + assertThat(TextExtractorConfiguration.readTextExtractorConfiguration(configuration)) + .isEqualTo(TextExtractorConfiguration.builder() + .contentTypeBlacklist(ImmutableList.of("application/ics", "application/zip")) + .build()); + } +} http://git-wip-us.apache.org/repos/asf/james-project/blob/d91fe698/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java ---------------------------------------------------------------------- diff --git a/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java b/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java index 3dc33a2..9bea1f2 100644 --- a/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java +++ b/server/container/guice/cassandra-guice/src/main/java/org/apache/james/modules/mailbox/TikaMailboxModule.java @@ -29,6 +29,8 @@ import org.apache.commons.configuration.ConfigurationException; import org.apache.james.mailbox.extractor.TextExtractor; import org.apache.james.mailbox.store.extractor.DefaultTextExtractor; import org.apache.james.mailbox.tika.CachingTextExtractor; +import org.apache.james.mailbox.tika.ContentTypeFilteringTextExtractor; +import org.apache.james.mailbox.tika.TextExtractorConfiguration; import org.apache.james.mailbox.tika.TikaConfiguration; import org.apache.james.mailbox.tika.TikaHttpClient; import org.apache.james.mailbox.tika.TikaHttpClientImpl; @@ -39,6 +41,7 @@ import org.apache.james.utils.PropertiesProvider; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.ImmutableList; import com.google.inject.AbstractModule; import com.google.inject.Provides; import com.google.inject.Scopes; @@ -48,6 +51,7 @@ public class TikaMailboxModule extends AbstractModule { private static final Logger LOGGER = LoggerFactory.getLogger(TikaMailboxModule.class); private static final String TIKA_CONFIGURATION_NAME = "tika"; + private static final String TEXT_EXTRACTOR_NAME = "text_extractor"; @Override @@ -57,6 +61,21 @@ public class TikaMailboxModule extends AbstractModule { @Provides @Singleton + private TextExtractorConfiguration getTextExtractorConfiguration(PropertiesProvider propertiesProvider) throws ConfigurationException { + try { + Configuration configuration = propertiesProvider.getConfiguration(TEXT_EXTRACTOR_NAME); + + return TextExtractorConfiguration.readTextExtractorConfiguration(configuration); + } catch (FileNotFoundException e) { + LOGGER.warn("Could not find {} configuration file.", TEXT_EXTRACTOR_NAME); + return TextExtractorConfiguration.builder() + .contentTypeBlacklist(ImmutableList.of()) + .build(); + } + } + + @Provides + @Singleton protected TikaHttpClient provideTikaHttpClient(TikaConfiguration tikaConfiguration) throws URISyntaxException { return new TikaHttpClientImpl(tikaConfiguration); } @@ -78,19 +97,21 @@ public class TikaMailboxModule extends AbstractModule { @Provides @Singleton - private TextExtractor provideTextExtractor(TikaTextExtractor textExtractor, TikaConfiguration configuration, + private TextExtractor provideTextExtractor(TextExtractorConfiguration textExtractorConfiguration, + TikaTextExtractor textExtractor, TikaConfiguration configuration, MetricFactory metricFactory, GaugeRegistry gaugeRegistry) { if (configuration.isEnabled() && configuration.isCacheEnabled()) { LOGGER.info("Tika cache has been enabled."); - return new CachingTextExtractor( - textExtractor, - configuration.getCacheEvictionPeriod(), - configuration.getCacheWeightInBytes(), - metricFactory, - gaugeRegistry); + return new ContentTypeFilteringTextExtractor( + new CachingTextExtractor( + textExtractor, + configuration.getCacheEvictionPeriod(), + configuration.getCacheWeightInBytes(), + metricFactory, + gaugeRegistry), textExtractorConfiguration); } if (configuration.isEnabled()) { - return textExtractor; + return new ContentTypeFilteringTextExtractor(textExtractor, textExtractorConfiguration); } LOGGER.info("Tika text extraction has been disabled." + " Using DefaultTextExtractor instead. " + --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org