This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ca13b74df5cd38b58d40c2ceb4b5b4d6b88c1b77
Author: Tilman Hausherr <[email protected]>
AuthorDate: Sun Jun 8 13:36:42 2025 +0200

    TIKA-4435: delete files from bucket; replace deprecated SelectObject with json; add debugging
---
 .../parser/transcribe/aws/AmazonTranscribe.java    | 125 ++++++++-------------
 1 file changed, 44 insertions(+), 81 deletions(-)

diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
index fb20f2522..db5600232 100644
--- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
+++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
@@ -17,19 +17,14 @@
 
 package org.apache.tika.parser.transcribe.aws;
 
-import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.stream.Collectors;
 
 import com.amazonaws.AmazonServiceException;
 import com.amazonaws.SdkClientException;
@@ -38,19 +33,15 @@ import com.amazonaws.auth.BasicAWSCredentials;
 import com.amazonaws.services.s3.AmazonS3;
 import com.amazonaws.services.s3.AmazonS3ClientBuilder;
 import com.amazonaws.services.s3.model.AmazonS3Exception;
-import com.amazonaws.services.s3.model.CompressionType;
-import com.amazonaws.services.s3.model.ExpressionType;
-import com.amazonaws.services.s3.model.InputSerialization;
-import com.amazonaws.services.s3.model.JSONInput;
-import com.amazonaws.services.s3.model.JSONOutput;
-import com.amazonaws.services.s3.model.JSONType;
-import com.amazonaws.services.s3.model.OutputSerialization;
+import com.amazonaws.services.s3.model.GetObjectRequest;
 import com.amazonaws.services.s3.model.PutObjectRequest;
 import com.amazonaws.services.s3.model.PutObjectResult;
-import com.amazonaws.services.s3.model.SelectObjectContentEvent;
-import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor;
-import com.amazonaws.services.s3.model.SelectObjectContentRequest;
-import com.amazonaws.services.s3.model.SelectObjectContentResult;
+import com.amazonaws.services.s3.model.S3Object;
+import com.amazonaws.services.s3.model.S3ObjectInputStream;
+import com.amazonaws.services.securitytoken.AWSSecurityTokenService;
+import 
com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder;
+import com.amazonaws.services.securitytoken.model.GetCallerIdentityRequest;
+import com.amazonaws.services.securitytoken.model.GetCallerIdentityResult;
 import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
 import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
 import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
@@ -60,9 +51,8 @@ import com.amazonaws.services.transcribe.model.Media;
 import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
 import com.amazonaws.services.transcribe.model.TranscriptionJob;
 import com.amazonaws.services.transcribe.model.TranscriptionJobStatus;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
@@ -100,7 +90,7 @@ public class AmazonTranscribe implements Parser, 
Initializable {
     private String bucketName;
     private String region;
     private boolean isAvailable; // Flag for whether or not transcription is 
available.
-    private String clientId;
+    private String clientId; // Access key
     private String clientSecret; // Keys used for the API calls.
     private AWSStaticCredentialsProvider credsProvider;
 
@@ -168,6 +158,7 @@ public class AmazonTranscribe implements Parser, 
Initializable {
         xhtml.endElement("p");
         xhtml.endDocument();
 
+        deleteFilesFromBucket(jobName);
     }
 
 
@@ -252,7 +243,16 @@ public class AmazonTranscribe implements Parser, 
Initializable {
         try {
             @SuppressWarnings("unused") PutObjectResult response = 
amazonS3.putObject(request);
         } catch (SdkClientException e) {
-            throw (new TikaException("File Upload to AWS Failed"));
+            throw new TikaException("File upload to AWS failed: " + 
e.getMessage(), e);
+        }
+    }
+
+    private void deleteFilesFromBucket(String jobName) throws TikaException {
+        try {
+            amazonS3.deleteObject(bucketName, jobName);
+            amazonS3.deleteObject(bucketName, jobName + ".json");
+        } catch (SdkClientException e) {
+            LOG.error("Failed to delete {} and/or {} from {}", jobName, 
jobName + ".json", bucketName, e);
         }
     }
 
@@ -272,69 +272,24 @@ public class AmazonTranscribe implements Parser, 
Initializable {
     private String getTranscriptText(String fileNameS3)
             throws AmazonServiceException, SdkClientException, IOException {
         TranscriptionJob transcriptionJob = 
retrieveObjectWhenJobCompleted(fileNameS3);
-        String text = null;
+        String text = "";
         if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name()
                 .equals(transcriptionJob.getTranscriptionJobStatus())) {
-            InputSerialization inputSerialization =
-                    new InputSerialization().withJson(new 
JSONInput().withType(JSONType.DOCUMENT))
-                            .withCompressionType(CompressionType.NONE);
-            OutputSerialization outputSerialization =
-                    new OutputSerialization().withJson(new JSONOutput());
-            SelectObjectContentRequest request =
-                    new 
SelectObjectContentRequest().withBucketName(this.bucketName)
-                            .withKey(fileNameS3 + ".json").withExpression(
-                            "Select s.results.transcripts[0].transcript from 
S3Object s")
-                            //WHERE transcript IS NOT MISSING
-                            .withExpressionType(ExpressionType.SQL)
-                            .withRequestCredentialsProvider(credsProvider);
-            request.setInputSerialization(inputSerialization);
-            request.setOutputSerialization(outputSerialization);
-
-            final AtomicBoolean isResultComplete = new AtomicBoolean(false);
-
-            try (SelectObjectContentResult result = 
amazonS3.selectObjectContent(request)) {
-                InputStream resultInputStream = result.getPayload()
-                        .getRecordsInputStream(new 
SelectObjectContentEventVisitor() {
-                            @Override
-                            public void 
visit(SelectObjectContentEvent.StatsEvent event) {
-                                LOG.debug("Received Stats, Bytes Scanned: " +
-                                        event.getDetails().getBytesScanned() +
-                                        " Bytes Processed: " +
-                                        
event.getDetails().getBytesProcessed());
-                            }
-
-                            /*
-                             * An End Event informs that the request has
-                             * finished successfully.
-                             */
-                            @Override
-                            public void 
visit(SelectObjectContentEvent.EndEvent event) {
-                                isResultComplete.set(true);
-                                LOG.debug("Received End Event. Result is 
complete.");
-                            }
-                        });
-                text = new BufferedReader(
-                        new InputStreamReader(resultInputStream, 
StandardCharsets.UTF_8)).lines()
-                        .collect(Collectors.joining("\n"));
-            }
-            /*
-             * The End Event indicates all matching records have been
-             * transmitted. If the End Event is not received, the results
-             * may be incomplete.
-             */
-            if (!isResultComplete.get()) {
-                throw new IOException(
-                        "S3 Select request was incomplete as End Event was not 
received.");
+            S3Object s3Object = amazonS3.getObject(new 
GetObjectRequest(bucketName, fileNameS3 + ".json"));
+            try (S3ObjectInputStream objectContent = 
s3Object.getObjectContent()) {
+                ObjectMapper mapper = new ObjectMapper();
+                JsonNode root = mapper.readTree(objectContent);
+                text = root
+                        .path("results")
+                        .path("transcripts")
+                        .get(0)
+                        .path("transcript")
+                        .asText();
+                // could also be done with json.simple:
+                // ((JSONObject)((JSONArray)((JSONObject) 
obj.get("results")).get("transcripts")).get(0)).get("transcript")
             }
         }
-        JSONParser parser = new JSONParser();
-        JSONObject obj = null;
-        try {
-            obj = (JSONObject) parser.parse(text);
-        } catch (ParseException e) {
-            throw new IOException(e.getMessage(), e);
-        }
-        return obj.get("transcript").toString();
+        return text;
     }
 
     /**
@@ -375,7 +330,15 @@ public class AmazonTranscribe implements Parser, 
Initializable {
                         
AmazonS3ClientBuilder.standard().withCredentials(credsProvider).build();
                 this.region = amazonS3.getRegionName(); // not sure if this 
works at all
             }
-            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+
+            // for debugging
+            AWSSecurityTokenService stsClient = 
AWSSecurityTokenServiceClientBuilder.standard()
+                    .withCredentials(credsProvider).withRegion(region)
+                    .build();
+            GetCallerIdentityResult identity = stsClient.getCallerIdentity(new 
GetCallerIdentityRequest());
+            LOG.debug("Authenticated as: {}", identity.getArn());
+
+            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) { // 
returns true if no access
                 try {
                     amazonS3.createBucket(this.bucketName);
                 } catch (AmazonS3Exception e) {

Reply via email to