[tika] branch main updated: TIKA-3403 Create example for Transcription (#444)

tallison Mon, 17 May 2021 13:48:48 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new d73ca6a  TIKA-3403 Create example for Transcription (#444)
d73ca6a is described below

commit d73ca6aa8458e23e9157dc344e5f782aa74d593b
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Mon May 17 13:48:31 2021 -0700

    TIKA-3403 Create example for Transcription (#444)
    
    * TIKA-3403 Create example for Transcription
    
    * TIKA-3403 Create example for Transcription
    
    * TIKA-3403 Create example for Transcription
---
 tika-example/pom.xml                               |  28 +-
 .../tika/example/TranscribeTranslateExample.java   | 108 +++++++
 tika-parent/pom.xml                                |   2 +-
 tika-transcribe/pom.xml                            |   9 +
 .../apache/tika/transcribe/AmazonTranscribe.java   | 310 +++++++++++++++------
 .../tika/language/translate/GoogleTranslator.java  | 128 ++++-----
 6 files changed, 426 insertions(+), 159 deletions(-)

diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index 3b10b91..f12304e 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -70,6 +70,25 @@
     </dependency>
     <dependency>
       <groupId>org.apache.tika</groupId>
+      <artifactId>tika-transcribe</artifactId>
+      <version>${project.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>commons-codec</groupId>
+          <artifactId>commons-codec</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-databind</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
       <artifactId>tika-core</artifactId>
       <version>${project.version}</version>
       <type>test-jar</type>
@@ -147,15 +166,6 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
-    <!--
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-test-resources</artifactId>
-      <version>${project.version}</version>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    -->
   </dependencies>
 
   <description>This module contains examples of how to use Apache 
Tika.</description>
diff --git 
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
 
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
new file mode 100644
index 0000000..12dd7e5
--- /dev/null
+++ 
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.io.FileInputStream;
+
+import org.apache.tika.language.translate.GoogleTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.transcribe.AmazonTranscribe;
+import org.apache.tika.transcribe.Transcriber;
+
+/**
+ * This example demonstrates primitive logic for
+ * chaining Tika API calls. In this case translation
+ * could be considered as a downstream process to 
+ * transcription.
+ * We simply pass the output of
+ * a call to {@link Transcriber#transcribe(java.io.InputStream)}
+ * into {@link Translator#translate(String, String)}. 
+ * The {@link GoogleTranslator} is configured with a target 
+ * language of "en-US".
+ * @author lewismc
+ *
+ */
+public class TranscribeTranslateExample {
+
+    /**
+     * Use {@link GoogleTranslator} to execute translation on
+     * input data. This implementation needs to be configured as explained in the 
Javadoc.
+     * In this implementation, Google will try to guess the input language. 
The target 
+     * language is "en-US".
+     * @param text input text to translate.
+     * @return translated text String.
+     */
+    public static String googleTranslateToEnglish(String text) {
+        Translator translator = new GoogleTranslator();
+        String result = null;
+        if (translator.isAvailable()) {
+            try {
+                result = translator.translate(text, "en-US");
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Use {@link AmazonTranscribe} to execute transcription on input data.
+     * This implementation needs to be configured as explained in the Javadoc.
+     * @param file the name of the file (which needs to be on the Java 
Classpath) to transcribe.
+     * @return transcribed text.
+     */
+    public static String amazonTranscribe(String file) {
+        String filePath = 
TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath();
+        String result = null;
+        Transcriber transcriber = new AmazonTranscribe();
+        if (transcriber.isAvailable()) {
+            try {
+                result = transcriber.transcribe(new FileInputStream(filePath));
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Main method to run this example. This program can be invoked as follows
+     * <ol>
+     * <li><code>transcribe-translate ${file}</code>; which executes both 
+     * transcription then translation on the given resource, or</li>
+     * <li><code>transcribe ${file}</code>; which executes only 
transcription.</li></ol>
+     * @param args either of the commands described above and the input file 
+     * (which needs to be on the Java Classpath). 
+     */
+    public static void main (String[] args) {
+        String text = null;
+        if (args.length != 0) {
+            if ("transcribe-translate".equals(args[0])) {
+                text = googleTranslateToEnglish(amazonTranscribe(args[1]));
+                System.out.print("Transcription and translation 
successful!\nEXTRAXCTED TEXT: " + text);
+            } else if ("transcribe".equals(args[0])) {
+                text = amazonTranscribe(args[1]);
+                System.out.print("Transcription successful!\nEXTRAXCTED TEXT: 
" + text);
+            } else {
+                System.out.print("Incorrect invocation, see Javadoc.");
+            }
+        } else {
+            System.out.print("Incorrect invocation, see Javadoc.");
+        }
+    }
+}
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 917169d..3907ce9 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -279,7 +279,7 @@
     <rat.version>0.13</rat.version>
 
     <!-- dependency versions -->
-    <aws.version>1.11.937</aws.version>
+    <aws.version>1.11.1018</aws.version>
     <boilerpipe.version>1.1.0</boilerpipe.version>
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
     <bouncycastle.version>1.68</bouncycastle.version>
diff --git a/tika-transcribe/pom.xml b/tika-transcribe/pom.xml
index c15561c..aadb137 100644
--- a/tika-transcribe/pom.xml
+++ b/tika-transcribe/pom.xml
@@ -51,6 +51,10 @@
                     <groupId>commons-logging</groupId>
                     <artifactId>commons-logging</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>commons-codec</groupId>
+                    <artifactId>commons-codec</artifactId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
@@ -58,6 +62,11 @@
             <artifactId>aws-java-sdk-s3</artifactId>
             <version>${aws.version}</version>
         </dependency>
+        <dependency>
+            <groupId>com.googlecode.json-simple</groupId>
+            <artifactId>json-simple</artifactId>
+            <version>${json.simple.version}</version>
+        </dependency>
         <!-- Test dependencies -->
         <dependency>
             <groupId>junit</groupId>
diff --git 
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
 
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
index c972fb1..5b50491 100644
--- 
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
+++ 
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
@@ -17,11 +17,28 @@
 
 package org.apache.tika.transcribe;
 
+import com.amazonaws.AmazonServiceException;
 import com.amazonaws.SdkClientException;
+import com.amazonaws.auth.AWSStaticCredentialsProvider;
+import com.amazonaws.auth.BasicAWSCredentials;
 import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.AmazonS3Exception;
+import com.amazonaws.services.s3.model.CompressionType;
+import com.amazonaws.services.s3.model.ExpressionType;
+import com.amazonaws.services.s3.model.InputSerialization;
+import com.amazonaws.services.s3.model.JSONInput;
+import com.amazonaws.services.s3.model.JSONOutput;
+import com.amazonaws.services.s3.model.JSONType;
+import com.amazonaws.services.s3.model.OutputSerialization;
 import com.amazonaws.services.s3.model.PutObjectRequest;
 import com.amazonaws.services.s3.model.PutObjectResult;
+import com.amazonaws.services.s3.model.SelectObjectContentEvent;
+import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor;
+import com.amazonaws.services.s3.model.SelectObjectContentRequest;
+import com.amazonaws.services.s3.model.SelectObjectContentResult;
 import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
+import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
 import com.amazonaws.services.transcribe.model.Media;
 import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
 import com.amazonaws.services.transcribe.model.TranscriptionJob;
@@ -30,19 +47,25 @@ import 
com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
 import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
 import com.amazonaws.services.transcribe.model.LanguageCode;
 import org.apache.tika.exception.TikaException;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.FileOutputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 import java.util.Properties;
 import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
 
 /**
- * Wrapper class to access the AWS transcription service.
+ * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> 
+ * {@link Transcriber} implementation. See Javadoc for configuration options.
  *
  * @since Tika 2.1
  */
@@ -55,32 +78,61 @@ public class AmazonTranscribe implements Transcriber {
     public static final String DEFAULT_SECRET = "dummy-secret";
     public static final String DEFAULT_BUCKET = "dummy-bucket";
     public static final String BUCKET_NAME = "transcribe.BUCKET_NAME";
-    private static final Logger LOG = 
LoggerFactory.getLogger(AmazonTranscribe.class);
-    private AmazonTranscribeAsync amazonTranscribe;
+    public static final String REGION = "transcribe.REGION";
+    private static final Logger LOG = LoggerFactory
+            .getLogger(AmazonTranscribe.class);
+    private AmazonTranscribeAsync amazonTranscribeAsync;
     private AmazonS3 amazonS3;
     private String bucketName;
-    private boolean isAvailable; // Flag for whether or not transcription is 
available.
+    private String region;
+    private boolean isAvailable; // Flag for whether or not transcription is
+    // available.
     private String clientId;
-    private String clientSecret;  // Keys used for the API calls.
+    private String clientSecret; // Keys used for the API calls.
+    private AWSStaticCredentialsProvider credsProvider;
 
     /**
-     * Create a new AmazonTranscriber with the client keys specified in
-     * resources/org/apache/tika/transcribe/transcribe.amazon.properties.
+     * Create a new AmazonTranscribe instance with the client keys specified in
+     * <code>transcribe.amazon.properties</code> which needs to be available on
+     * the Java Classpath.
      * Silently becomes unavailable when client keys are unavailable.
-     * transcribe.AWS_ACCESS_KEY, transcribe.AWS_SECRET_KEY, and 
transcribe.BUCKET_NAME must be set in transcribe.amazon.properties for 
transcription to work.
+     * <code>transcribe.AWS_ACCESS_KEY</code>,
+     * <code>transcribe.AWS_SECRET_KEY</code>,
+     * <code>transcribe.BUCKET_NAME</code> and 
+     * <code>transcribe.REGION</code> must be set in
+     * <code>transcribe.amazon.properties</code>.
+     * <b>N.B.</b> it is not necessary to create the bucket beforehand. 
+     * This implementation will automatically create the bucket if one
+     * does not already exist, per the name defined above.
      *
-     * @since Tika 2.1
+     * @since Tika 2.0
      */
     public AmazonTranscribe() {
         Properties config = new Properties();
         try {
             config.load(AmazonTranscribe.class
-                    .getResourceAsStream(
-                            PROPERTIES_FILE));
+                    .getResourceAsStream(PROPERTIES_FILE));
             this.clientId = config.getProperty(ID_PROPERTY);
             this.clientSecret = config.getProperty(SECRET_PROPERTY);
             this.bucketName = config.getProperty(BUCKET_NAME);
+            this.region = config.getProperty(REGION);
+            BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId,
+                    this.clientSecret);
+            this.credsProvider = new AWSStaticCredentialsProvider(creds);
+            amazonS3 = AmazonS3ClientBuilder.standard()
+                    .withCredentials(credsProvider).withRegion(this.region)
+                    .build();
             this.isAvailable = checkAvailable();
+            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+                try {
+                    amazonS3.createBucket(this.bucketName);
+                } catch (AmazonS3Exception e) {
+                    throw new RuntimeException(e.getErrorMessage());
+                }
+            }
+            this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder
+                    .standard().withCredentials(credsProvider)
+                    .withRegion(this.region).build();
         } catch (Exception e) {
             LOG.warn("Exception reading config file", e);
             isAvailable = false;
@@ -97,18 +149,21 @@ public class AmazonTranscribe implements Transcriber {
     }
 
     /**
-     * Constructs a new
-     * {@link PutObjectRequest} object to upload a file to the
-     * specified bucket and jobName. After constructing the request,
-     * users may optionally specify object metadata or a canned ACL as well.
+     * Constructs a new {@link PutObjectRequest} object to upload a file to the
+     * specified bucket and jobName. After constructing the request, users may
+     * optionally specify object metadata or a canned ACL as well.
      *
-     * @param file    The file to upload to Amazon S3.
-     * @param jobName The unique job name for each job(UUID).
+     * @param inputStream
+     *            The input stream whose content is uploaded to Amazon S3.
+     * @param jobName
+     *            The unique job name for each job(UUID).
      */
-    private void uploadFileToBucket(File file, String jobName) throws 
TikaException {
-        PutObjectRequest request = new PutObjectRequest(this.bucketName, 
jobName, file);
+    private void uploadFileToBucket(InputStream inputStream, String jobName)
+            throws TikaException {
+        PutObjectRequest request = new PutObjectRequest(this.bucketName,
+                jobName, inputStream, null);
         try {
-            //  Block of code to try
+            @SuppressWarnings("unused")
             PutObjectResult response = amazonS3.putObject(request);
         } catch (SdkClientException e) {
             throw (new TikaException("File Upload to AWS Failed"));
@@ -118,68 +173,76 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Starts AWS Transcribe Job without language specification.
      *
-     * @param inputStream the source input stream.
+     * @param inputStream
+     *            the source input stream.
      * @return The transcribed string result, NULL if the job failed.
-     * @throws TikaException When there is an error transcribing.
-     * @throws IOException   If an I/O exception of some sort has occurred.
+     * @throws TikaException
+     *             When there is an error transcribing.
+     * @throws IOException
+     *             If an I/O exception of some sort has occurred.
      */
     @Override
-    public String transcribe(InputStream inputStream) throws TikaException, 
IOException {
-        if (!isAvailable()) return null;
+    public String transcribe(InputStream inputStream)
+            throws TikaException, IOException {
+        if (!isAvailable())
+            return null;
         String jobName = getJobKey();
-        byte[] buffer = new byte[inputStream.available()];
-        inputStream.read(buffer);
-        File targetFile = new File("src/main/resources/targetFile.tmp");
-        try (OutputStream outStream = new FileOutputStream(targetFile)) {
-            outStream.write(buffer);
-        }
-        targetFile.deleteOnExit();
-        uploadFileToBucket(targetFile, jobName);
+        uploadFileToBucket(inputStream, jobName);
         StartTranscriptionJobRequest startTranscriptionJobRequest = new 
StartTranscriptionJobRequest();
         Media media = new Media();
         media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
-        startTranscriptionJobRequest.withMedia(media)
-                .withOutputBucketName(this.bucketName)
-                .setTranscriptionJobName(jobName);
-        amazonTranscribe.startTranscriptionJob(startTranscriptionJobRequest);
-        return getTranscriptResult(jobName);
+        
startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media)
+        .withOutputBucketName(this.bucketName)
+        .withTranscriptionJobName(jobName)
+        .setRequestCredentialsProvider(credsProvider);
+        amazonTranscribeAsync
+        .startTranscriptionJob(startTranscriptionJobRequest);
+        return getTranscriptText(jobName);
     }
 
     /**
      * Starts AWS Transcribe Job with language specification.
      *
-     * @param inputStream    the source input stream.
-     * @param sourceLanguage <a 
href="https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
 Language Code</a> for the language used in the input media file.
+     * @param inputStream
+     *            the source input stream.
+     * @param sourceLanguage
+     *            <a href=
+     *            
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+     *            Language Code</a> for the language used in the input media
+     *            file.
      * @return The transcribed string result, NULL if the job failed.
-     * @throws TikaException When there is an error transcribing.
-     * @throws IOException   If an I/O exception of some sort has occurred.
-     * @see <a 
href="https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
 Language Code</a>
+     * @throws TikaException
+     *             When there is an error transcribing.
+     * @throws IOException
+     *             If an I/O exception of some sort has occurred.
+     * @see <a href=
+     *      
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+     *      Language Code</a>
      */
     @Override
-    public String transcribe(InputStream inputStream, String sourceLanguage) 
throws TikaException, IOException {
-        if (!isAvailable()) return null;
+    public String transcribe(InputStream inputStream, String sourceLanguage)
+            throws TikaException, IOException {
+        if (!isAvailable())
+            return null;
         String jobName = getJobKey();
-        byte[] buffer = new byte[inputStream.available()];
-        inputStream.read(buffer);
-        File targetFile = new File("src/main/resources/targetFile.tmp");
-        try (OutputStream outStream = new FileOutputStream(targetFile)) {
-            outStream.write(buffer);
-        }
-        targetFile.deleteOnExit();
-        uploadFileToBucket(targetFile, jobName);
+        uploadFileToBucket(inputStream, jobName);
         StartTranscriptionJobRequest startTranscriptionJobRequest = new 
StartTranscriptionJobRequest();
         Media media = new Media();
         media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
-        startTranscriptionJobRequest.withMedia(media)
-                .withLanguageCode(LanguageCode.fromValue(sourceLanguage))
-                .withOutputBucketName(this.bucketName)
-                .setTranscriptionJobName(jobName);
-        amazonTranscribe.startTranscriptionJob(startTranscriptionJobRequest);
-        return getTranscriptResult(jobName);
+        ((StartTranscriptionJobRequest) startTranscriptionJobRequest
+                .withMedia(media).withOutputBucketName(this.bucketName)
+                .withTranscriptionJobName(jobName)
+                .withRequestCredentialsProvider(credsProvider))
+        .withLanguageCode(
+                LanguageCode.fromValue(sourceLanguage));
+        amazonTranscribeAsync
+        .startTranscriptionJob(startTranscriptionJobRequest);
+        return getTranscriptText(jobName);
     }
 
     /**
-     * @return true if this Transcriber is probably able to transcribe right 
now.
+     * @return true if this Transcriber is probably able to transcribe right
+     *         now.
      * @since Tika 2.1
      */
     @Override
@@ -190,7 +253,8 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Sets the client Id for the transcriber API.
      *
-     * @param id The ID to set.
+     * @param id
+     *            The ID to set.
      */
     public void setId(String id) {
         this.clientId = id;
@@ -200,7 +264,8 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Sets the client secret for the transcriber API.
      *
-     * @param secret The secret to set.
+     * @param secret
+     *            The secret to set.
      */
     public void setSecret(String secret) {
         this.clientSecret = secret;
@@ -210,7 +275,8 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Sets the S3 bucket name for the transcriber API.
      *
-     * @param bucket The bucket to set.
+     * @param bucket
+     *            The bucket to set.
      */
     public void setBucket(String bucket) {
         this.bucketName = bucket;
@@ -223,44 +289,118 @@ public class AmazonTranscribe implements Transcriber {
      * @return if the service is available
      */
     private boolean checkAvailable() {
-        return clientId != null &&
-                !clientId.equals(DEFAULT_ID) &&
-                clientSecret != null &&
-                !clientSecret.equals(DEFAULT_SECRET) &&
-                bucketName != null &&
-                !bucketName.equals(DEFAULT_BUCKET);
+        return clientId != null && !clientId.equals(DEFAULT_ID)
+                && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET)
+                && bucketName != null && !bucketName.equals(DEFAULT_BUCKET);
     }
 
     /**
      * Gets Transcription result from AWS S3 bucket given the jobName.
      *
-     * @param fileNameS3 The path of the file to upload to Amazon S3.
+     * @param fileNameS3
+     *            The job name; the transcript is read from the S3 key {@code fileNameS3 + ".json"}.
      * @return The transcribed string result, NULL if the job failed.
+     * @throws IOException possible reasons include (i) an End Event is not 
received
+     * from AWS S3 SelectObjectContentResult operation and (ii) a parse 
exception
+     * whilst processing JSON from the AWS S3 SelectObjectContentResult 
operation.
+     * @throws SdkClientException an AWS-specific exception related to the 
SelectObjectContentResult
+     * operation.
+     * @throws AmazonServiceException possibly thrown if there is an issue 
selecting object content
+     * from AWS S3 objects.
      */
-    private String getTranscriptResult(String fileNameS3) {
-        TranscriptionJob transcriptionJob = 
retrieveObjectWhenJobCompleted(fileNameS3);
-        if (transcriptionJob != null && 
!TranscriptionJobStatus.FAILED.name().equals(transcriptionJob.getTranscriptionJobStatus()))
 {
-            return amazonS3.getObjectAsString(this.bucketName, fileNameS3);
-        } else
-            return null;
+    private String getTranscriptText(String fileNameS3) throws 
AmazonServiceException, SdkClientException, IOException {
+        TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(
+                fileNameS3);
+        String text = null;
+        if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name()
+                .equals(transcriptionJob.getTranscriptionJobStatus())) {
+            InputSerialization inputSerialization = new 
InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT))
+                    .withCompressionType(CompressionType.NONE);
+            OutputSerialization outputSerialization = new 
OutputSerialization().withJson(new JSONOutput());
+            SelectObjectContentRequest request = new 
SelectObjectContentRequest()
+                    .withBucketName(this.bucketName).withKey(fileNameS3 + 
".json")
+                    .withExpression("Select 
s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT 
MISSING
+                    
.withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider);
+            request.setInputSerialization(inputSerialization);
+            request.setOutputSerialization(outputSerialization);
+
+            final AtomicBoolean isResultComplete = new AtomicBoolean(false);
+
+            try (SelectObjectContentResult result = amazonS3
+                    .selectObjectContent(request)) {
+                InputStream resultInputStream = result.getPayload()
+                        .getRecordsInputStream(
+                                new SelectObjectContentEventVisitor() {
+                                    @Override
+                                    public void visit(
+                                            
SelectObjectContentEvent.StatsEvent event) {
+                                        LOG.debug(
+                                                "Received Stats, Bytes 
Scanned: "
+                                                        + event.getDetails()
+                                                        .getBytesScanned()
+                                                        + " Bytes Processed: "
+                                                        + event.getDetails()
+                                                        .getBytesProcessed());
+                                    }
+
+                                    /*
+                                     * An End Event informs that the request 
has
+                                     * finished successfully.
+                                     */
+                                    @Override
+                                    public void visit(
+                                            SelectObjectContentEvent.EndEvent 
event) {
+                                        isResultComplete.set(true);
+                                        LOG.debug(
+                                                "Received End Event. Result is 
complete.");
+                                    }
+                                });
+                text = new BufferedReader(
+                        new InputStreamReader(resultInputStream, 
StandardCharsets.UTF_8))
+                        .lines()
+                        .collect(Collectors.joining("\n"));
+            }
+            /*
+             * The End Event indicates all matching records have been
+             * transmitted. If the End Event is not received, the results
+             * may be incomplete.
+             */
+            if (!isResultComplete.get()) {
+                throw new IOException(
+                        "S3 Select request was incomplete as End Event was not 
received.");
+            }
+        }
+        JSONParser parser = new JSONParser();
+        JSONObject obj = null;
+        try {
+            obj = (JSONObject) parser.parse(text);
+        } catch (ParseException e) {
+            throw new IOException(e.getMessage(), e);
+        }
+        return obj.get("transcript").toString();
     }
 
     /**
      * Private helper function to get object from s3.
      *
-     * @param jobName The unique job name for each job(UUID).
+     * @param jobName
+     *            The unique job name for each job(UUID).
      * @return TranscriptionJob object
      */
     private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) {
         GetTranscriptionJobRequest getTranscriptionJobRequest = new 
GetTranscriptionJobRequest();
+        getTranscriptionJobRequest
+        .withRequestCredentialsProvider(credsProvider);
         getTranscriptionJobRequest.setTranscriptionJobName(jobName);
         while (true) {
-            GetTranscriptionJobResult innerResult = 
amazonTranscribe.getTranscriptionJob(getTranscriptionJobRequest);
-            String status = 
innerResult.getTranscriptionJob().getTranscriptionJobStatus();
-            if (TranscriptionJobStatus.COMPLETED.name().equals(status) ||
-                    TranscriptionJobStatus.FAILED.name().equals(status)) {
+            GetTranscriptionJobResult innerResult = amazonTranscribeAsync
+                    .getTranscriptionJob(getTranscriptionJobRequest);
+            String status = innerResult.getTranscriptionJob()
+                    .getTranscriptionJobStatus();
+            if (TranscriptionJobStatus.COMPLETED.name().equals(status)
+                    || TranscriptionJobStatus.FAILED.name().equals(status)) {
                 return innerResult.getTranscriptionJob();
             }
         }
     }
-}
+}
\ No newline at end of file
diff --git 
a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
 
b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
index 98f8459..88202bb 100644
--- 
a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
+++ 
b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
@@ -47,69 +47,69 @@ import org.slf4j.LoggerFactory;
  */
 public class GoogleTranslator extends AbstractTranslator {
 
-       private static final Logger LOG = LoggerFactory.getLogger(GoogleTranslator.class);
-
-       private static final String GOOGLE_TRANSLATE_URL_BASE = "https://www.googleapis.com/language/translate/v2";
-
-       private static final String DEFAULT_KEY = "dummy-secret";
-
-       private WebClient client;
-
-       private String apiKey;
-
-       private boolean isAvailable;
-
-       public GoogleTranslator() {
-               this.client = WebClient.create(GOOGLE_TRANSLATE_URL_BASE);
-               this.isAvailable = true;
-               Properties config = new Properties();
-               try {
-                       config.load(GoogleTranslator.class
-                                       .getResourceAsStream(
-                                                       "translator.google.properties"));
-                       this.apiKey = config.getProperty("translator.client-secret");
-                       if (this.apiKey.equals(DEFAULT_KEY))
-                               this.isAvailable = false;
-               } catch (Exception e) {
-                       LOG.warn("Exception reading config file", e);
-                       isAvailable = false;
-               }
-       }
-
-       @Override
-       public String translate(String text, String sourceLanguage,
-                       String targetLanguage) throws TikaException, IOException {
-               if (!this.isAvailable)
-                       return text;
-               Response response = client.accept(MediaType.APPLICATION_JSON)
-                               .query("key", apiKey).query("source", sourceLanguage)
-                               .query("target", targetLanguage).query("q", text).get();
-               BufferedReader reader = new BufferedReader(new InputStreamReader(
-                               (InputStream) response.getEntity(), UTF_8));
-               String line = null;
-               StringBuffer responseText = new StringBuffer();
-               while ((line = reader.readLine()) != null) {
-                       responseText.append(line);
-               }
-
-               ObjectMapper mapper = new ObjectMapper();
-               JsonNode jsonResp = mapper.readTree(responseText.toString());
-               return jsonResp.findValuesAsText("translatedText").get(0);
-       }
-
-       @Override
-       public String translate(String text, String targetLanguage)
-                       throws TikaException, IOException {
-               if (!this.isAvailable)
-                       return text;
-               
-               String sourceLanguage = detectLanguage(text).getLanguage();
-               return translate(text, sourceLanguage, targetLanguage);
-       }
-
-       @Override
-       public boolean isAvailable() {
-               return this.isAvailable;
-       }
+    private static final Logger LOG = LoggerFactory.getLogger(GoogleTranslator.class);
+
+    private static final String GOOGLE_TRANSLATE_URL_BASE = "https://www.googleapis.com/language/translate/v2";
+
+    private static final String DEFAULT_KEY = "dummy-secret";
+
+    private WebClient client;
+
+    private String apiKey;
+
+    private boolean isAvailable;
+
+    public GoogleTranslator() {
+        this.client = WebClient.create(GOOGLE_TRANSLATE_URL_BASE);
+        this.isAvailable = true;
+        Properties config = new Properties();
+        try {
+            config.load(GoogleTranslator.class
+                    .getResourceAsStream(
+                            "translator.google.properties"));
+            this.apiKey = config.getProperty("translator.client-secret");
+            if (this.apiKey.equals(DEFAULT_KEY))
+                this.isAvailable = false;
+        } catch (Exception e) {
+            LOG.warn("Exception reading config file", e);
+            isAvailable = false;
+        }
+    }
+
+    @Override
+    public String translate(String text, String sourceLanguage,
+            String targetLanguage) throws TikaException, IOException {
+        if (!this.isAvailable)
+            return text;
+        Response response = client.accept(MediaType.APPLICATION_JSON)
+                .query("key", apiKey).query("source", sourceLanguage)
+                .query("target", targetLanguage).query("q", text).get();
+        BufferedReader reader = new BufferedReader(new InputStreamReader(
+                (InputStream) response.getEntity(), UTF_8));
+        String line = null;
+        StringBuffer responseText = new StringBuffer();
+        while ((line = reader.readLine()) != null) {
+            responseText.append(line);
+        }
+
+        ObjectMapper mapper = new ObjectMapper();
+        JsonNode jsonResp = mapper.readTree(responseText.toString());
+        return jsonResp.findValuesAsText("translatedText").get(0);
+    }
+
+    @Override
+    public String translate(String text, String targetLanguage)
+            throws TikaException, IOException {
+        if (!this.isAvailable)
+            return text;
+
+        String sourceLanguage = detectLanguage(text).getLanguage();
+        return translate(text, sourceLanguage, targetLanguage);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return this.isAvailable;
+    }
 
 }

[tika] branch main updated: TIKA-3403 Create example for Transcription (#444)

Reply via email to