This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4519 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d173308da61493a6514ac64c5688ff6209368324 Merge: b06877bce e15e9f22e Author: tallison <[email protected]> AuthorDate: Mon Oct 27 14:47:15 2025 -0400 Merge branch 'main' into TIKA-4519 # Conflicts: # tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java # tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java CHANGES.txt | 14 +-- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +- .../test/java/org/apache/tika/cli/TikaCLITest.java | 31 ++++- tika-app/src/test/resources/test-data/testPST.pst | Bin 0 -> 2302976 bytes .../extractor/DefaultEmbeddedStreamTranslator.java | 21 ++-- .../tika/extractor/EmbeddedStreamTranslator.java | 8 +- .../apache/tika/extractor/RUnpackExtractor.java | 36 ++++-- .../java/org/apache/tika/io/FilenameUtils.java | 31 ++++- .../java/org/apache/tika/io/FilenameUtilsTest.java | 8 +- tika-detectors/tika-detector-magika/pom.xml | 36 ------ tika-detectors/tika-detector-siegfried/pom.xml | 36 ------ .../tika/pipes/kafka/tests/TikaPipesKafkaTest.java | 2 +- .../tika-pipes-s3-integration-tests/pom.xml | 5 + .../tika/pipes/s3/tests/PipeIntegrationTests.java | 49 +++++--- .../tika/pipes/s3/tests/S3PipeIntegrationTest.java | 77 +++++++------ .../src/test/resources/tika-config-s3ToFs.xml | 21 ++-- .../src/test/resources/tika-config-s3Tos3.xml | 23 ++-- .../pipes/solr/tests/TikaPipesSolrTestBase.java | 2 +- tika-parent/pom.xml | 66 ++++++----- tika-parsers/pom.xml | 2 +- .../tika-parser-scientific-package/pom.xml | 47 -------- .../tika-parser-sqlite3-package/pom.xml | 41 ------- tika-parsers/tika-parsers-ml/pom.xml | 2 + .../microsoft/MSEmbeddedStreamTranslator.java | 39 +++---- .../microsoft/PSTEmailStreamTranslator.java | 55 +++++++++ ....apache.tika.extractor.EmbeddedStreamTranslator | 3 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 3 + .../org/apache/tika/async/cli/TikaAsyncCLI.java | 2 +- .../tika/async/cli/TikaConfigAsyncWriter.java | 8 +- .../tika-emitters/tika-emitter-az-blob/pom.xml | 43 ------- tika-pipes/tika-emitters/tika-emitter-gcs/pom.xml | 43 ------- .../tika-emitters/tika-emitter-kafka/pom.xml | 43 ------- .../tika-emitters/tika-emitter-opensearch/pom.xml | 43 ------- tika-pipes/tika-emitters/tika-emitter-s3/pom.xml | 51 +-------- .../apache/tika/pipes/emitter/s3/S3Emitter.java | 111 +++++++++++------- tika-pipes/tika-emitters/tika-emitter-solr/pom.xml | 43 ------- .../tika-fetchers/tika-fetcher-az-blob/pom.xml | 43 ------- tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml | 43 ------- tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml | 42 ------- .../tika-fetcher-microsoft-graph/pom.xml | 45 +------- tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml | 55 ++------- .../apache/tika/pipes/fetcher/s3/S3Fetcher.java | 127 ++++++++++++--------- .../org/apache/tika/pipes/core/PipesClient.java | 2 +- .../AbstractEmbeddedDocumentBytesHandler.java | 37 +----- .../tika-pipes-iterator-az-blob/pom.xml | 43 ------- .../tika-pipes-iterator-csv/pom.xml | 43 ------- .../tika-pipes-iterator-gcs/pom.xml | 43 ------- .../tika-pipes-iterator-jdbc/pom.xml | 43 ------- .../tika-pipes-iterator-json/pom.xml | 43 ------- .../tika-pipes-iterator-kafka/pom.xml | 43 ------- .../tika-pipes-iterator-s3/pom.xml | 51 +-------- .../pipes/pipesiterator/s3/S3PipesIterator.java | 97 ++++++++++------ .../tika-pipes-iterator-solr/pom.xml | 43 ------- .../tika-pipes-reporter-fs-status/pom.xml | 43 ------- .../tika-pipes-reporter-jdbc/pom.xml | 43 ------- .../tika-pipes-reporter-opensearch/pom.xml | 43 ------- tika-server/tika-server-client/pom.xml | 47 -------- .../server/core/resource/TranslateResource.java | 50 +++++--- .../server/core/resource/UnpackerResource.java | 27 ++--- tika-translate/pom.xml | 2 +- 60 files changed, 599 insertions(+), 1548 deletions(-) diff --cc tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java index 3d9e25530,4a63046f6..a35a559fe --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java @@@ -187,12 -201,19 +201,19 @@@ public class S3PipesIterator extends Pi long start = System.currentTimeMillis(); int count = 0; HandlerConfig handlerConfig = getHandlerConfig(); - Matcher fileNameMatcher = null; + final Matcher fileNameMatcher; if (fileNamePattern != null) { fileNameMatcher = fileNamePattern.matcher(""); + } else { + fileNameMatcher = null; } - for (S3ObjectSummary summary : S3Objects.withPrefix(s3Client, bucket, prefix)) { - if (fileNameMatcher != null && !accept(fileNameMatcher, summary.getKey())) { - ++ + ListObjectsV2Request listObjectsV2Request = ListObjectsV2Request.builder().bucket(bucket).prefix(prefix).build(); + List<S3Object> s3ObjectList = s3Client.listObjectsV2Paginator(listObjectsV2Request).stream(). + flatMap(resp -> resp.contents().stream()).toList(); + for (S3Object s3Object : s3ObjectList) { + String key = s3Object.key(); + if (fileNameMatcher != null && !accept(fileNameMatcher, key)) { continue; } long elapsed = System.currentTimeMillis() - start;
