This is an automated email from the ASF dual-hosted git repository.

btellier pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/james-project.git
The following commit(s) were added to refs/heads/master by this push:
     new 4181de38b2 JAMES-2456 Add a reactor timeout for Tika requests (#2437)
4181de38b2 is described below

commit 4181de38b23cad9a12c64498ea5db49427453e5e
Author: Trần Hồng Quân <55171818+quantranhong1...@users.noreply.github.com>
AuthorDate: Mon Oct 7 19:13:02 2024 +0700

    JAMES-2456 Add a reactor timeout for Tika requests (#2437)
---
 docs/modules/servers/partials/configure/tika.adoc  |  2 +-
 .../json/MessageToOpenSearchJsonTest.java          | 27 +++++++
 .../nonTextualWithoutAttachmentTextContent.json    | 83 ++++++++++++++++++++++
 .../james/mailbox/tika/TikaHttpClientImpl.java     |  3 +-
 .../apache/james/mailbox/tika/TikaContainer.java   |  6 +-
 .../james/mailbox/tika/TikaTextExtractorTest.java  | 17 +++++
 6 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/docs/modules/servers/partials/configure/tika.adoc b/docs/modules/servers/partials/configure/tika.adoc
index 4e2ae16662..c12b693c34 100644
--- a/docs/modules/servers/partials/configure/tika.adoc
+++ b/docs/modules/servers/partials/configure/tika.adoc
@@ -26,7 +26,7 @@ Defaults to false.
 | Port of your tika server. The default value is 9998
 | tika.timeoutInMillis
-| Timeout when issuing request to the tika server. The default value is 3 seconds.
+| Timeout when issuing request to the tika server. The default value is 30 seconds.
 | tika.cache.eviction.period
 | A cache is used to avoid, when possible, query Tika multiple time for the same attachments.
diff --git a/mailbox/opensearch/src/test/java/org/apache/james/mailbox/opensearch/json/MessageToOpenSearchJsonTest.java b/mailbox/opensearch/src/test/java/org/apache/james/mailbox/opensearch/json/MessageToOpenSearchJsonTest.java index 3ef2e058d1..0908728920 100644 --- a/mailbox/opensearch/src/test/java/org/apache/james/mailbox/opensearch/json/MessageToOpenSearchJsonTest.java +++ b/mailbox/opensearch/src/test/java/org/apache/james/mailbox/opensearch/json/MessageToOpenSearchJsonTest.java @@ -484,6 +484,33 @@ class MessageToOpenSearchJsonTest { ClassLoaderUtils.getSystemResourceAsString("eml/nonTextual.json", StandardCharsets.UTF_8)); } + @Test + void convertMessageShouldSucceedWithAttachmentTextContentBeNullWhenTikaReadTimeout() throws Exception { + textExtractor = new TikaTextExtractor(new RecordingMetricFactory(), new TikaHttpClientImpl(TikaConfiguration.builder() + .host(tika.getIp()) + .port(tika.getPort()) + .timeoutInMillis(1) + .build())); + + MessageToOpenSearchJson messageToOpenSearchJson = new MessageToOpenSearchJson( + textExtractor, + ZoneId.of("Europe/Paris"), + IndexAttachments.YES, IndexHeaders.YES); + MailboxMessage spamMail = new SimpleMailboxMessage(MESSAGE_ID, THREAD_ID, date, + SIZE, + BODY_START_OCTET, + new ByteContent(IOUtils.toByteArray(ClassLoaderUtils.getSystemResourceAsSharedStream("eml/nonTextual.eml"))), + new Flags(), + propertyBuilder.build(), + MAILBOX_ID); + spamMail.setUid(UID); + spamMail.setModSeq(MOD_SEQ); + + assertThatJson(messageToOpenSearchJson.convertToJson(spamMail).block()) + .when(IGNORING_ARRAY_ORDER) + .isEqualTo(ClassLoaderUtils.getSystemResourceAsString("eml/nonTextualWithoutAttachmentTextContent.json", StandardCharsets.UTF_8)); + } + @Test void convertToJsonWithoutAttachmentShouldConvertEmailBoby() throws IOException { // Given diff --git a/mailbox/store/src/test/resources/eml/nonTextualWithoutAttachmentTextContent.json b/mailbox/store/src/test/resources/eml/nonTextualWithoutAttachmentTextContent.json new file mode 
100644 index 0000000000..cf5be90ccb --- /dev/null +++ b/mailbox/store/src/test/resources/eml/nonTextualWithoutAttachmentTextContent.json @@ -0,0 +1,83 @@ +{ + "attachments": [ + { + "mediaType":"application", + "subtype":"vnd.oasis.opendocument.text", + "fileName":"toto.odt","fileExtension":"odt", + "contentDisposition":"attachment", + "textContent": null + } + ], + "bcc":[], + "htmlBody":null, + "textBody":"This mail have a non textual attachment !\r\n", + "cc":[], + "date":"2015-06-07T00:00:00+0200", + "saveDate": null, + "from":[{ + "name":"Benoit Tellier", + "address":"btell...@linagora.com", + "domain":"linagora" + }], + "hasAttachment":false, + "headers": [{ + "name": "return-path", + "value": "<btell...@linagora.com>" + }, { + "name": "received", + "value": "from alderaan.linagora.com (smtp.linagora.dc1 [172.16.18.53])\t by imap (Cyrus v2.2.13-Debian-2.2.13-19+squeeze3) with LMTPA;\t Thu, 18 Jun 2015 12:43:28 +0200" + }, { + "name": "x-sieve", + "value": "CMU Sieve 2.2" + }, { + "name": "received", + "value": "from [10.75.9.154] (unknown [92.103.166.6])\t(using TLSv1 with cipher DHE-RSA-AES128-SHA (128/128 bits))\t(No client certificate requested)\tby alderaan.linagora.com (Postfix) with ESMTPSA id 0EB1078A\tfor <btell...@linagora.com>; Thu, 18 Jun 2015 12:43:28 +0200 (CEST)" + }, { + "name": "to", + "value": "btell...@linagora.com" + }, { + "name": "from", + "value": "Benoit Tellier <btell...@linagora.com>" + }, { + "name": "subject", + "value": "Test message" + }, { + "name": "message-id", + "value": "<5582a0ce.4020...@linagora.com>" + }, { + "name": "date", + "value": "Thu, 18 Jun 2015 12:43:26 +0200" + }, { + "name": "user-agent", + "value": "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.0.1" + }, { + "name": "mime-version", + "value": "1.0" + }, { + "name": "content-type", + "value": "multipart/mixed; boundary=\"------------030000010109090603040500\"" + }], + "mailboxId":"18", + "mediaType":"plain", + "messageId":"184", + 
"threadId": "184", + "modSeq":42, + "sentDate":"2015-06-18T12:43:26+0200", + "size":25, + "subject":["Test message"], + "subtype":"text", + "to":[{ + "name":null, + "address":"btell...@linagora.com", + "domain":"linagora" + }], + "uid":25, + "userFlags":[], + "mimeMessageID":"<5582a0ce.4020...@linagora.com>", + "isAnswered":false, + "isDeleted":false, + "isDraft":false, + "isFlagged":false, + "isRecent":false, + "isUnread":true +} diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java index a3227cbf6a..ca7f1a4ce9 100644 --- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java +++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaHttpClientImpl.java @@ -83,8 +83,9 @@ public class TikaHttpClientImpl implements TikaHttpClient { return Mono.empty(); } }) + .timeout(Duration.ofMillis(tikaConfiguration.getTimeoutInMillis())) .onErrorResume(e -> { - LOGGER.warn("Failing to call Tika for content type {}", contentType, e); + LOGGER.warn("Failing to call Tika for content type '{}'", contentType.asString(), e); return Mono.empty(); }); } diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaContainer.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaContainer.java index b0e6a42346..fe9a66a739 100644 --- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaContainer.java +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaContainer.java @@ -18,9 +18,10 @@ ****************************************************************/ package org.apache.james.mailbox.tika; +import static org.apache.james.mailbox.tika.TikaConfiguration.DEFAULT_TIMEOUT_IN_MS; + import java.time.Duration; import java.util.UUID; -import java.util.concurrent.TimeUnit; import org.apache.james.util.docker.DockerContainer; import org.apache.james.util.docker.Images; @@ -28,12 +29,9 @@ import 
org.apache.james.util.docker.RateLimiters; import org.junit.rules.ExternalResource; import org.testcontainers.containers.wait.strategy.Wait; -import com.google.common.primitives.Ints; - public class TikaContainer extends ExternalResource { private static final int DEFAULT_TIKA_PORT = 9998; - private static final int DEFAULT_TIMEOUT_IN_MS = Ints.checkedCast(TimeUnit.MINUTES.toMillis(3)); private final DockerContainer tika; diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java index 6c0f76ac0f..150859fd13 100644 --- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java @@ -26,6 +26,7 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.List; +import java.util.concurrent.TimeoutException; import org.apache.commons.io.IOUtils; import org.apache.james.mailbox.extractor.ParsedContent; @@ -76,6 +77,22 @@ class TikaTextExtractorTest { .contains("This is some awesome text text."); } + @Test + void shouldReturnEmptyParsedContentWhenTimeoutTriggered() throws Exception { + textExtractor = new TikaTextExtractor(new RecordingMetricFactory(), new TikaHttpClientImpl(TikaConfiguration.builder() + .host(tika.getIp()) + .port(tika.getPort()) + .timeoutInMillis(1) + .build())); + + InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.docx"); + + assertThat(textExtractor.extractContentReactive(inputStream, + ContentType.of("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) + .block()) + .isEqualTo(ParsedContent.empty()); + } + @Test void textMicrosoftWorldTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.docx"); 
---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscr...@james.apache.org
For additional commands, e-mail: notifications-h...@james.apache.org