JAMES-2456 Upgrade Tika / Tika client should not throw
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/c8bd682a Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/c8bd682a Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/c8bd682a Branch: refs/heads/master Commit: c8bd682adfadeb3b68c2da7624f44c5d25d09b7f Parents: 6ef1a29 Author: Antoine Duprat <[email protected]> Authored: Fri Jul 6 08:45:36 2018 +0200 Committer: benwa <[email protected]> Committed: Wed Jul 11 10:29:00 2018 +0700 ---------------------------------------------------------------------- .../james/mailbox/tika/TikaException.java | 26 -------------------- .../james/mailbox/tika/TikaHttpClient.java | 3 ++- .../james/mailbox/tika/TikaHttpClientImpl.java | 20 ++++++++------- .../james/mailbox/tika/TikaTextExtractor.java | 26 ++++++++++++++------ .../mailbox/tika/TikaTextExtractorTest.java | 17 +++++++++---- .../org/apache/james/util/docker/Images.java | 2 +- 6 files changed, 45 insertions(+), 49 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java deleted file mode 100644 index ecdc742..0000000 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaException.java +++ /dev/null @@ -1,26 +0,0 @@ -/**************************************************************** - * Licensed to the Apache Software Foundation (ASF) under one * - * or more contributor license agreements. See the NOTICE file * - * distributed with this work for additional information * - * regarding copyright ownership. The ASF licenses this file * - * to you under the Apache License, Version 2.0 (the * - * "License"); you may not use this file except in compliance * - * with the License. You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, * - * software distributed under the License is distributed on an * - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * - * KIND, either express or implied. See the License for the * - * specific language governing permissions and limitations * - * under the License. * - ****************************************************************/ -package org.apache.james.mailbox.tika; - -public class TikaException extends RuntimeException { - - public TikaException(Exception exception) { - super(exception); - } -} http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java index e736d72..9e490db 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClient.java @@ -19,8 +19,9 @@ package org.apache.james.mailbox.tika; import java.io.InputStream; +import java.util.Optional; public interface TikaHttpClient { - InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException; + Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType); } http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java index a8d9df4..32ee7e6 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; +import java.util.Optional; import org.apache.http.client.fluent.Request; import org.apache.http.client.utils.URIBuilder; @@ -51,17 +52,18 @@ public class TikaHttpClientImpl implements TikaHttpClient { } @Override - public InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException { + public Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType) { try { - return Request.Put(recursiveMetaData) - .socketTimeout(tikaConfiguration.getTimeoutInMillis()) - .bodyStream(inputStream, ContentType.create(contentType)) - .execute() - .returnContent() - .asStream(); + return Optional.ofNullable( + Request.Put(recursiveMetaData) + .socketTimeout(tikaConfiguration.getTimeoutInMillis()) + .bodyStream(inputStream, ContentType.create(contentType)) + .execute() + .returnContent() + .asStream()); } catch (IOException e) { - LOGGER.error("Failing to call Tika", e); - throw new TikaException(e); + LOGGER.warn("Failing to call Tika", e); + return Optional.empty(); } } http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java index 955647e..305e2a1 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Objects; +import java.util.Optional; import java.util.function.Predicate; import javax.inject.Inject; @@ -51,6 +52,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; public class TikaTextExtractor implements TextExtractor { @@ -82,11 +84,13 @@ public class TikaTextExtractor implements TextExtractor { public ParsedContent performContentExtraction(InputStream inputStream, String contentType) throws IOException { ContentAndMetadata contentAndMetadata = convert(tikaHttpClient.recursiveMetaDataAsJson(inputStream, contentType)); - return new ParsedContent(contentAndMetadata.getContent(), contentAndMetadata.getMetadata()); + return new ParsedContent(contentAndMetadata.getContent().orElse(null), contentAndMetadata.getMetadata()); } - private ContentAndMetadata convert(InputStream json) throws IOException, JsonParseException, JsonMappingException { - return objectMapper.readValue(json, ContentAndMetadata.class); + private ContentAndMetadata convert(Optional<InputStream> maybeInputStream) throws IOException, JsonParseException, JsonMappingException { + return maybeInputStream + .map(Throwing.function(inputStream -> objectMapper.readValue(inputStream, ContentAndMetadata.class))) + .orElse(ContentAndMetadata.empty()); } @VisibleForTesting @@ -119,8 +123,12 @@ public class TikaTextExtractor implements TextExtractor { private static final String TIKA_HEADER = "X-TIKA"; private static final String CONTENT_METADATA_HEADER_NAME = TIKA_HEADER + ":content"; + public static ContentAndMetadata empty() { + return new ContentAndMetadata(); + } + public static ContentAndMetadata from(Map<String, List<String>> contentAndMetadataMap) { - return new ContentAndMetadata(content(contentAndMetadataMap), + return new ContentAndMetadata(Optional.ofNullable(content(contentAndMetadataMap)), contentAndMetadataMap.entrySet().stream() .filter(allHeadersButTika()) .collect(Guavate.toImmutableMap(Entry::getKey, Entry::getValue))); @@ -139,15 +147,19 @@ public class TikaTextExtractor implements TextExtractor { return StringUtils.stripStart(content.get(0), onlySpaces); } - private final String content; + private final Optional<String> content; private final Map<String, List<String>> metadata; - private ContentAndMetadata(String content, Map<String, List<String>> metadata) { + private ContentAndMetadata() { + this(Optional.empty(), ImmutableMap.of()); + } + + private ContentAndMetadata(Optional<String> content, Map<String, List<String>> metadata) { this.content = content; this.metadata = metadata; } - public String getContent() { + public Optional<String> getContent() { return content; } http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java ---------------------------------------------------------------------- diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java index 455a275..a78821f 100644 --- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java @@ -27,6 +27,7 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.List; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.james.mailbox.extractor.ParsedContent; @@ -157,8 +158,10 @@ public class TikaTextExtractorTest { @Test public void deserializerShouldNotThrowWhenMoreThanOneNode() throws Exception { TikaTextExtractor textExtractor = new TikaTextExtractor( - new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " + - "{\"Chroma BlackIsZero\": \"true\"}]").getBytes(StandardCharsets.UTF_8))); + new NoopMetricFactory(), + (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " + + "{\"Chroma BlackIsZero\": \"true\"}]") + .getBytes(StandardCharsets.UTF_8)))); InputStream inputStream = null; textExtractor.extractContent(inputStream, "text/plain"); @@ -168,8 +171,10 @@ public class TikaTextExtractorTest { public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception { String expectedExtractedContent = "content A"; TikaTextExtractor textExtractor = new TikaTextExtractor( - new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " + - "{\"X-TIKA:content\": \"content B\"}]").getBytes(StandardCharsets.UTF_8))); + new NoopMetricFactory(), + (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " + + "{\"X-TIKA:content\": \"content B\"}]") + .getBytes(StandardCharsets.UTF_8)))); InputStream inputStream = null; ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain"); @@ -183,7 +188,9 @@ public class TikaTextExtractorTest { expectedException.expectMessage("The element should be a Json object"); TikaTextExtractor textExtractor = new TikaTextExtractor( - new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream("[\"value1\"]".getBytes(StandardCharsets.UTF_8))); + new NoopMetricFactory(), + (inputStream, contentType) -> Optional.of(new ByteArrayInputStream("[\"value1\"]" + .getBytes(StandardCharsets.UTF_8)))); InputStream inputStream = null; textExtractor.extractContent(inputStream, "text/plain"); http://git-wip-us.apache.org/repos/asf/james-project/blob/c8bd682a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java ---------------------------------------------------------------------- diff --git a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java b/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java index 549cf02..d0da2aa 100644 --- a/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java +++ b/server/container/util-java8/src/test/java/org/apache/james/util/docker/Images.java @@ -24,6 +24,6 @@ public interface Images { String RABBITMQ = "rabbitmq:3.7.5"; String ELASTICSEARCH = "elasticsearch:2.2.2"; String NGINX = "nginx:1.7.1"; - String TIKA = "logicalspark/docker-tikaserver:1.15rc2"; + String TIKA = "linagora/docker-tikaserver:1.18-SNAPSHOT-plus-TIKA-2520"; String SPAMASSASSIN = "dinkel/spamassassin:3.4.0"; } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
