This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 14001cb7b TIKA-4524: migrate to aws v2
14001cb7b is described below
commit 14001cb7b24742e0e785b4c86d2311358404ac34
Author: Tilman Hausherr <[email protected]>
AuthorDate: Sat Oct 18 10:44:11 2025 +0200
TIKA-4524: migrate to aws v2
---
.../tika-pipes-s3-integration-tests/pom.xml | 5 ++
.../tika/pipes/s3/tests/S3PipeIntegrationTest.java | 73 ++++++++++++++--------
2 files changed, 53 insertions(+), 25 deletions(-)
diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml b/tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml
index 98a93b578..cdb8b4141 100644
--- a/tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml
+++ b/tika-integration-tests/tika-pipes-s3-integration-tests/pom.xml
@@ -74,6 +74,11 @@
<artifactId>log4j-slf4j2-impl</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>software.amazon.awssdk</groupId>
+ <artifactId>s3</artifactId>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
index d03b549ed..0a9a8a4a0 100644
--- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
+++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
@@ -18,19 +18,18 @@ package org.apache.tika.pipes.s3.tests;
import java.io.File;
import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
+import java.util.Base64;
import java.util.HashSet;
import java.util.Set;
-import com.amazonaws.auth.AWSStaticCredentialsProvider;
-import com.amazonaws.auth.BasicAWSCredentials;
-import com.amazonaws.client.builder.AwsClientBuilder;
-import com.amazonaws.regions.Regions;
-import com.amazonaws.services.s3.AmazonS3;
-import com.amazonaws.services.s3.AmazonS3ClientBuilder;
-import com.amazonaws.services.s3.model.S3Object;
+import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.jetbrains.annotations.NotNull;
import org.junit.jupiter.api.AfterAll;
@@ -42,9 +41,20 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.DockerComposeContainer;
import org.testcontainers.junit.jupiter.Testcontainers;
-import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
import org.testcontainers.shaded.org.hamcrest.MatcherAssert;
import org.testcontainers.shaded.org.hamcrest.Matchers;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.core.ResponseInputStream;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.model.ChecksumAlgorithm;
+import software.amazon.awssdk.services.s3.model.CreateBucketRequest;
+import software.amazon.awssdk.services.s3.model.GetObjectRequest;
+import software.amazon.awssdk.services.s3.model.GetObjectResponse;
+import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import org.apache.tika.cli.TikaCLI;
import org.apache.tika.pipes.HandlerConfig;
@@ -65,14 +75,14 @@ class S3PipeIntegrationTest {
private static final String FETCH_BUCKET = "fetch-bucket";
private static final String EMIT_BUCKET = "emit-bucket";
- private static final String REGION = Regions.US_EAST_1.getName();
+ private static final Region REGION = Region.US_EAST_1;
- private AmazonS3 s3Client;
+ private S3Client s3Client;
private final File testFileFolder = new File("target", "test-files");
private final Set<String> testFiles = new HashSet<>();
- private void createTestFiles() {
+ private void createTestFiles() throws NoSuchAlgorithmException {
if (testFileFolder.mkdirs()) {
LOG.info("Created test folder: {}", testFileFolder);
}
@@ -80,13 +90,24 @@ class S3PipeIntegrationTest {
for (int i = 0; i < numDocs; ++i) {
String nextFileName = "test-" + i + ".html";
testFiles.add(nextFileName);
- s3Client.putObject(FETCH_BUCKET, nextFileName,
- "<html><body>body-of-" + nextFileName + "</body></html>");
+ String s = "<html><body>body-of-" + nextFileName + "</body></html>";
+ byte[] bytes = s.getBytes(StandardCharsets.US_ASCII);
+ // checksum must be done explicitly, or we get an exception:
+ // "The provided 'x-amz-content-sha256' header does not match what was computed"
+ // https://github.com/minio/minio/issues/17662
+ // https://github.com/minio/minio/issues/20845
+ MessageDigest md = MessageDigest.getInstance("SHA256");
+ byte [] hash = md.digest(bytes);
+ String enc64 = Base64.getEncoder().encodeToString(hash);
+ PutObjectRequest request = PutObjectRequest.builder().bucket(FETCH_BUCKET).key(nextFileName).
+ checksumAlgorithm(ChecksumAlgorithm.SHA256).checksumSHA256(enc64).build();
+ RequestBody requestBody = RequestBody.fromBytes(bytes);
+ s3Client.putObject(request, requestBody);
}
}
@BeforeAll
- void setupMinio() {
+ void setupMinio() throws URISyntaxException {
minioContainer.start();
initializeS3Client();
}
@@ -96,20 +117,21 @@ class S3PipeIntegrationTest {
minioContainer.close();
}
- private void initializeS3Client() {
- AwsClientBuilder.EndpointConfiguration endpoint =
- new AwsClientBuilder.EndpointConfiguration(MINIO_ENDPOINT, REGION);
- s3Client = AmazonS3ClientBuilder.standard().withCredentials(
- new AWSStaticCredentialsProvider(new BasicAWSCredentials(ACCESS_KEY, SECRET_KEY)))
- .withEndpointConfiguration(endpoint).withPathStyleAccessEnabled(true).build();
+ private void initializeS3Client() throws URISyntaxException {
+ AwsBasicCredentials awsCreds = AwsBasicCredentials.create(ACCESS_KEY, SECRET_KEY);
+ // https://github.com/aws/aws-sdk-java-v2/discussions/3536
+ StaticCredentialsProvider credentialsProvider = StaticCredentialsProvider.create(awsCreds);
+ S3Configuration s3c = S3Configuration.builder().pathStyleAccessEnabled(true).build(); // SO11228792
+ s3Client = S3Client.builder().serviceConfiguration(s3c).region(REGION).
+ credentialsProvider(credentialsProvider).endpointOverride(new URI(MINIO_ENDPOINT)).build();
}
@Test
void s3PipelineIteratorS3FetcherAndS3Emitter() throws Exception {
// create s3 bucket for fetches and for emits
- s3Client.createBucket(FETCH_BUCKET);
- s3Client.createBucket(EMIT_BUCKET);
+ s3Client.createBucket(CreateBucketRequest.builder().bucket(FETCH_BUCKET).build());
+ s3Client.createBucket(CreateBucketRequest.builder().bucket(EMIT_BUCKET).build());
// create some test files and insert into fetch bucket
createTestFiles();
@@ -139,9 +161,10 @@ class S3PipeIntegrationTest {
}
for (String testFile : testFiles) {
- S3Object object = s3Client.getObject(EMIT_BUCKET, testFile + ".json");
+ GetObjectRequest objectRequest = GetObjectRequest.builder().bucket(EMIT_BUCKET).key(testFile + ".json").build();
+ ResponseInputStream<GetObjectResponse> object = s3Client.getObject(objectRequest);
Assertions.assertNotNull(object);
- String data = IOUtils.toString(object.getObjectContent(), StandardCharsets.UTF_8);
+ String data = IOUtils.toString(object, StandardCharsets.UTF_8);
MatcherAssert.assertThat(
"Should be able to read the parsed body of the HTML file as the body of the document",
data, Matchers.containsString("body-of-" + testFile));
@@ -159,6 +182,6 @@ class S3PipeIntegrationTest {
.replace("{EMIT_BUCKET}", EMIT_BUCKET).replace("{FETCH_BUCKET}", FETCH_BUCKET)
.replace("{ACCESS_KEY}", ACCESS_KEY).replace("{SECRET_KEY}", SECRET_KEY)
.replace("{ENDPOINT_CONFIGURATION_SERVICE}", MINIO_ENDPOINT)
- .replace("{REGION}", REGION);
+ .replace("{REGION}", REGION.id());
}
}