This is an automated email from the ASF dual-hosted git repository. tilman pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7223eebaf84138049a0fb1e2adbbb5ef2bbc8d68 Author: Tilman Hausherr <[email protected]> AuthorDate: Mon Oct 20 15:34:53 2025 +0200 TIKA-4525: migrate to aws v2 --- .../tika-pipes-iterator-s3/pom.xml | 8 +- .../pipes/pipesiterator/s3/S3PipesIterator.java | 101 +++++++++++++-------- 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml index 38fca8743..45267951e 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/pom.xml @@ -39,8 +39,12 @@ <scope>provided</scope> </dependency> <dependency> - <groupId>com.amazonaws</groupId> - <artifactId>aws-java-sdk-s3</artifactId> + <groupId>software.amazon.awssdk</groupId> + <artifactId>s3</artifactId> + </dependency> + <dependency> + <groupId>software.amazon.awssdk</groupId> + <artifactId>apache-client</artifactId> </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java index fc5f6378c..622755e9f 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java @@ -19,25 +19,32 @@ package org.apache.tika.pipes.pipesiterator.s3; import static org.apache.tika.config.TikaConfig.mustNotBeEmpty; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.List; import java.util.Map; import java.util.concurrent.TimeoutException; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.amazonaws.AmazonClientException; -import com.amazonaws.ClientConfiguration; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.auth.InstanceProfileCredentialsProvider; -import com.amazonaws.auth.profile.ProfileCredentialsProvider; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.iterable.S3Objects; -import com.amazonaws.services.s3.model.S3ObjectSummary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; +import software.amazon.awssdk.auth.credentials.InstanceProfileCredentialsProvider; +import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.core.checksums.RequestChecksumCalculation; +import software.amazon.awssdk.core.exception.SdkClientException; +import software.amazon.awssdk.http.SdkHttpClient; +import software.amazon.awssdk.http.SdkHttpConfigurationOption; +import software.amazon.awssdk.http.apache.ApacheHttpClient; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.S3ClientBuilder; +import software.amazon.awssdk.services.s3.S3Configuration; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.S3Object; import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; @@ -66,10 +73,10 @@ public class S3PipesIterator extends PipesIterator implements Initializable { private String profile; private String bucket; private Pattern fileNamePattern = null; - private int maxConnections = ClientConfiguration.DEFAULT_MAX_CONNECTIONS; + private int maxConnections = SdkHttpConfigurationOption.GLOBAL_HTTP_DEFAULTS.get(SdkHttpConfigurationOption.MAX_CONNECTIONS); private boolean pathStyleAccessEnabled = false; - private AmazonS3 s3Client; + private S3Client s3Client; @Field public void setEndpointConfigurationService(String endpointConfigurationService) { @@ -136,7 +143,7 @@ public class S3PipesIterator extends PipesIterator implements Initializable { /** * This initializes the s3 client. Note, we wrap S3's RuntimeExceptions, - * e.g. AmazonClientException in a TikaConfigException. + * e.g. SdkClientException in a TikaConfigException. * * @param params params to use for initialization * @throws TikaConfigException @@ -144,31 +151,40 @@ public class S3PipesIterator extends PipesIterator implements Initializable { @Override public void initialize(Map<String, Param> params) throws TikaConfigException { //params have already been set...ignore them - AWSCredentialsProvider provider; - if (credentialsProvider.equals("instance")) { - provider = InstanceProfileCredentialsProvider.getInstance(); - } else if (credentialsProvider.equals("profile")) { - provider = new ProfileCredentialsProvider(profile); - } else if (credentialsProvider.equals("key_secret")) { - provider = new AWSStaticCredentialsProvider(new BasicAWSCredentials(accessKey, secretKey)); - } else { - throw new TikaConfigException("credentialsProvider must be set and " + "must be either 'instance', 'profile' or 'key_secret'"); + AwsCredentialsProvider provider; + switch (credentialsProvider) { + case "instance": + provider = InstanceProfileCredentialsProvider.builder().build(); + break; + case "profile": + provider = ProfileCredentialsProvider.builder().profileName(profile).build(); + break; + case "key_secret": + AwsBasicCredentials awsCreds = AwsBasicCredentials.create(accessKey, secretKey); + provider = StaticCredentialsProvider.create(awsCreds); + break; + default: + throw new TikaConfigException("credentialsProvider must be set and " + "must be either 'instance', 'profile' or 'key_secret'"); } - ClientConfiguration clientConfig = new ClientConfiguration().withMaxConnections(maxConnections); + SdkHttpClient httpClient = ApacheHttpClient.builder().maxConnections(maxConnections).build(); + S3Configuration clientConfig = S3Configuration.builder().pathStyleAccessEnabled(pathStyleAccessEnabled).build(); try { - AmazonS3ClientBuilder amazonS3ClientBuilder = AmazonS3ClientBuilder - .standard() - .withClientConfiguration(clientConfig) - .withCredentials(provider) - .withPathStyleAccessEnabled(pathStyleAccessEnabled); + S3ClientBuilder s3ClientBuilder = S3Client.builder().httpClient(httpClient). + requestChecksumCalculation(RequestChecksumCalculation.WHEN_REQUIRED). // https://stackoverflow.com/a/79488850/535646 + serviceConfiguration(clientConfig).credentialsProvider(provider); if (!StringUtils.isBlank(endpointConfigurationService)) { - amazonS3ClientBuilder.setEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(endpointConfigurationService, region)); + try { + s3ClientBuilder.endpointOverride(new URI(endpointConfigurationService)).region(Region.of(region)); + } + catch (URISyntaxException ex) { + throw new TikaConfigException("bad endpointConfigurationService: " + endpointConfigurationService, ex); + } } else { - amazonS3ClientBuilder.withRegion(region); + s3ClientBuilder.region(Region.of(region)); } - s3Client = amazonS3ClientBuilder.build(); - } catch (AmazonClientException e) { + s3Client = s3ClientBuilder.build(); + } catch (SdkClientException e) { throw new TikaConfigException("can't initialize s3 pipesiterator", e); } } @@ -187,20 +203,27 @@ public class S3PipesIterator extends PipesIterator implements Initializable { long start = System.currentTimeMillis(); int count = 0; HandlerConfig handlerConfig = getHandlerConfig(); - Matcher fileNameMatcher = null; + final Matcher fileNameMatcher; if (fileNamePattern != null) { fileNameMatcher = fileNamePattern.matcher(""); + } else { + fileNameMatcher = null; } - for (S3ObjectSummary summary : S3Objects.withPrefix(s3Client, bucket, prefix)) { - if (fileNameMatcher != null && !accept(fileNameMatcher, summary.getKey())) { + + ListObjectsV2Request listObjectsV2Request = ListObjectsV2Request.builder().bucket(bucket).prefix(prefix).build(); + List<S3Object> s3ObjectList = s3Client.listObjectsV2Paginator(listObjectsV2Request).stream(). + flatMap(resp -> resp.contents().stream()).toList(); + for (S3Object s3Object : s3ObjectList) { + String key = s3Object.key(); + if (fileNameMatcher != null && !accept(fileNameMatcher, key)) { continue; } long elapsed = System.currentTimeMillis() - start; - LOGGER.debug("adding ({}) {} in {} ms", count, summary.getKey(), elapsed); + LOGGER.debug("adding ({}) {} in {} ms", count, key, elapsed); //TODO -- allow user specified metadata as the "id"? ParseContext parseContext = new ParseContext(); parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(summary.getKey(), new FetchKey(fetcherName, summary.getKey()), new EmitKey(emitterName, summary.getKey()), new Metadata(), parseContext, + tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherName, key), new EmitKey(emitterName, key), new Metadata(), parseContext, getOnParseException())); count++; }
