[
https://issues.apache.org/jira/browse/TIKA-4674?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18061462#comment-18061462
]
Hudson commented on TIKA-4674:
------------------------------
SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk17 #1225 (See
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk17/1225/])
TIKA-4674 - progress timeout (#2650) (github:
[https://github.com/apache/tika/commit/3b53d0d1dd3a4ae8d77127643479371e20044814])
* (edit) tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.json
* (edit)
tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/config/ConfigMergerTest.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-tls-two-way-template.json
* (edit) docs/modules/ROOT/pages/pipes/shared-server-mode.adoc
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/config/ConfigMerger.java
* (edit) tika-serialization/src/test/resources/configs/timeout-limits-test.json
* (add) tika-core/src/main/java/org/apache/tika/config/TikaProgressTracker.java
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-crashing-detector.json
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-pipes-basic.json
* (edit) docs/modules/ROOT/pages/pipes/index.adoc
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-emitter.json
* (edit)
tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-fetchers-emitters.json
* (edit) tika-serialization/src/test/resources/configs/test-partial-config.json
* (edit)
tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
* (edit)
tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/protocol/PipesMessage.java
* (edit) tika-serialization/src/test/resources/configs/all-limits-test.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-bad-jvm-args.json
* (edit)
tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
* (edit)
tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
* (add)
tika-core/src/test/java/org/apache/tika/config/TikaProgressTrackerTest.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/cxf-unpack-test-template.json
* (edit)
tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
* (edit)
tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
* (edit)
tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
* (edit)
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.json
* (edit)
tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
* (edit)
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-with-timeout.json
* (edit)
tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json
* (edit)
tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
* (edit)
tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
* (edit) docs/modules/ROOT/nav.adoc
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
* (edit)
tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java
* (edit)
tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-timeout-lt-heartbeat.json
* (edit)
tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json
* (edit)
tika-pipes/tika-async-cli/src/test/resources/configs/config-template.json
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-tls-one-way-template.json
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server.json
* (edit)
tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.json
* (edit) tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-shared-server.json
* (edit)
tika-serialization/src/test/java/org/apache/tika/serialization/RoundTripSerializationTest.java
* (edit)
tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json
* (edit)
tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/protocol/PipesMessageTest.java
* (edit)
tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json
* (delete) tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java
* (edit)
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
* (edit)
tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java
* (edit)
tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
* (edit) tika-app/src/test/resources/configs/config-template.json
* (edit)
tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
* (edit)
tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
* (add) docs/modules/ROOT/pages/pipes/timeouts.adoc
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java
* (edit)
tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-tls.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-timeout-100ms.json
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/config/ConfigOverrides.java
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
* (edit)
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
* (edit) tika-grpc/src/test/resources/tika-pipes-test-config.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-bad-java-path.json
* (edit)
tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json
* (edit)
tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
* (edit)
tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
* (edit) tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-bad-class.json
* (edit)
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-emit-all.json
* (edit)
tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
* (edit) tika-core/src/main/java/org/apache/tika/config/TimeoutLimits.java
* (edit)
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-basic.json
* (edit)
tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java
* (edit)
tika-server/tika-server-core/src/test/resources/configs/tika-config-server-fetcher-template.json
* (edit) tika-serialization/src/test/resources/configs/test-config-loader.json
> Add a progress timeout feature
> ------------------------------
>
> Key: TIKA-4674
> URL: https://issues.apache.org/jira/browse/TIKA-4674
> Project: Tika
> Issue Type: New Feature
> Reporter: Tim Allison
> Priority: Major
>
> When processing a 100-page PDF that requires OCR, we want to allow a LOT of
> time, but we also don't want to allow a lot of time for some file that
> triggers an infinite loop in a parser.
> I propose adding a progress timeout feature that will be enforced in
> tika-pipes. We'll update the progress counter in ocr parsers and anywhere
> else where we expect processing to take a while.
> TotalTaskTimeout will still be operative.
> So, one scenario would be TotalTaskTimeout is an hour, with progress timeout
> set for 2 minutes. If a call to Tesseract takes more than 2 minutes, then the
> job is stopped. Or if a rogue parser goes for longer than 2 minutes (and the
> progress counter is not in the loop where it is going rogue!), then that will
> time out in 2 minutes.
> We could then get rid of timeouts on the external parsers, and then have them
> read these global timeouts, with a focus on the progress timeout.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)