This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d73ca6a TIKA-3403 Create example for Transcription (#444)
d73ca6a is described below
commit d73ca6aa8458e23e9157dc344e5f782aa74d593b
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Mon May 17 13:48:31 2021 -0700
TIKA-3403 Create example for Transcription (#444)
* TIKA-3403 Create example for Transcription
* TIKA-3403 Create example for Transcription
* TIKA-3403 Create example for Transcription
---
tika-example/pom.xml | 28 +-
.../tika/example/TranscribeTranslateExample.java | 108 +++++++
tika-parent/pom.xml | 2 +-
tika-transcribe/pom.xml | 9 +
.../apache/tika/transcribe/AmazonTranscribe.java | 310 +++++++++++++++------
.../tika/language/translate/GoogleTranslator.java | 128 ++++-----
6 files changed, 426 insertions(+), 159 deletions(-)
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index 3b10b91..f12304e 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -70,6 +70,25 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
+ <artifactId>tika-transcribe</artifactId>
+ <version>${project.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
@@ -147,15 +166,6 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
- <!--
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-test-resources</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- -->
</dependencies>
<description>This module contains examples of how to use Apache
Tika.</description>
diff --git
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
new file mode 100644
index 0000000..12dd7e5
--- /dev/null
+++
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.io.FileInputStream;
+
+import org.apache.tika.language.translate.GoogleTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.transcribe.AmazonTranscribe;
+import org.apache.tika.transcribe.Transcriber;
+
+/**
+ * This example demonstrates primitive logic for
+ * chaining Tika API calls. In this case translation
+ * could be considered as a downstream process to
+ * transcription.
+ * We simply pass the output of
+ * a call to {@link Transcriber#transcribe(java.io.InputStream)}
+ * into {@link Translator#translate(String, String)}.
+ * The {@link GoogleTranslator} is configured with a target
+ * language of "en-US".
+ * @author lewismc
+ *
+ */
+public class TranscribeTranslateExample {
+
+ /**
+ * Use {@link GoogleTranslator} to execute translation on
+ * input data. This implementation needs to be configured as explained in the
Javadoc.
+ * In this implementation, Google will try to guess the input language.
The target
+ * language is "en-US".
+ * @param text input text to translate.
+ * @return translated text String.
+ */
+ public static String googleTranslateToEnglish(String text) {
+ Translator translator = new GoogleTranslator();
+ String result = null;
+ if (translator.isAvailable()) {
+ try {
+ result = translator.translate(text, "en-US");
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Use {@link AmazonTranscribe} to execute transcription on input data.
+ * This implementation needs to be configured as explained in the Javadoc.
+ * @param file the name of the file (which needs to be on the Java
Classpath) to transcribe.
+ * @return transcribed text.
+ */
+ public static String amazonTranscribe(String file) {
+ String filePath =
TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath();
+ String result = null;
+ Transcriber transcriber = new AmazonTranscribe();
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new FileInputStream(filePath));
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Main method to run this example. This program can be invoked as follows
+ * <ol>
+ * <li><code>transcribe-translate ${file}</code>; which executes both
+ * transcription then translation on the given resource, or
+ * <li><code>transcribe ${file}</code>; which executes only
transcription</li>
+ * @param args either of the commands described above and the input file
+ * (which needs to be on the Java Classpath).
+ */
+ public static void main (String[] args) {
+ String text = null;
+ if (args.length != 0) {
+ if ("transcribe-translate".equals(args[0])) {
+ text = googleTranslateToEnglish(amazonTranscribe(args[1]));
+ System.out.print("Transcription and translation
successful!\nEXTRAXCTED TEXT: " + text);
+ } else if ("transcribe".equals(args[0])) {
+ text = amazonTranscribe(args[1]);
+ System.out.print("Transcription successful!\nEXTRAXCTED TEXT:
" + text);
+ } else {
+ System.out.print("Incorrect invocation, see Javadoc.");
+ }
+ } else {
+ System.out.print("Incorrect invocation, see Javadoc.");
+ }
+ }
+}
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 917169d..3907ce9 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -279,7 +279,7 @@
<rat.version>0.13</rat.version>
<!-- dependency versions -->
- <aws.version>1.11.937</aws.version>
+ <aws.version>1.11.1018</aws.version>
<boilerpipe.version>1.1.0</boilerpipe.version>
<!-- used by POI, PDFBox and Jackcess ...try to sync -->
<bouncycastle.version>1.68</bouncycastle.version>
diff --git a/tika-transcribe/pom.xml b/tika-transcribe/pom.xml
index c15561c..aadb137 100644
--- a/tika-transcribe/pom.xml
+++ b/tika-transcribe/pom.xml
@@ -51,6 +51,10 @@
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
@@ -58,6 +62,11 @@
<artifactId>aws-java-sdk-s3</artifactId>
<version>${aws.version}</version>
</dependency>
+ <dependency>
+ <groupId>com.googlecode.json-simple</groupId>
+ <artifactId>json-simple</artifactId>
+ <version>${json.simple.version}</version>
+ </dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
diff --git
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
index c972fb1..5b50491 100644
---
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
+++
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
@@ -17,11 +17,28 @@
package org.apache.tika.transcribe;
+import com.amazonaws.AmazonServiceException;
import com.amazonaws.SdkClientException;
+import com.amazonaws.auth.AWSStaticCredentialsProvider;
+import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.AmazonS3Exception;
+import com.amazonaws.services.s3.model.CompressionType;
+import com.amazonaws.services.s3.model.ExpressionType;
+import com.amazonaws.services.s3.model.InputSerialization;
+import com.amazonaws.services.s3.model.JSONInput;
+import com.amazonaws.services.s3.model.JSONOutput;
+import com.amazonaws.services.s3.model.JSONType;
+import com.amazonaws.services.s3.model.OutputSerialization;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
+import com.amazonaws.services.s3.model.SelectObjectContentEvent;
+import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor;
+import com.amazonaws.services.s3.model.SelectObjectContentRequest;
+import com.amazonaws.services.s3.model.SelectObjectContentResult;
import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
+import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
import com.amazonaws.services.transcribe.model.Media;
import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
import com.amazonaws.services.transcribe.model.TranscriptionJob;
@@ -30,19 +47,25 @@ import
com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
import com.amazonaws.services.transcribe.model.LanguageCode;
import org.apache.tika.exception.TikaException;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.File;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.FileOutputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
/**
- * Wrapper class to access the AWS transcription service.
+ * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a>
+ * {@link Transcriber} implementation. See Javadoc for configuration options.
*
* @since Tika 2.1
*/
@@ -55,32 +78,61 @@ public class AmazonTranscribe implements Transcriber {
public static final String DEFAULT_SECRET = "dummy-secret";
public static final String DEFAULT_BUCKET = "dummy-bucket";
public static final String BUCKET_NAME = "transcribe.BUCKET_NAME";
- private static final Logger LOG =
LoggerFactory.getLogger(AmazonTranscribe.class);
- private AmazonTranscribeAsync amazonTranscribe;
+ public static final String REGION = "transcribe.REGION";
+ private static final Logger LOG = LoggerFactory
+ .getLogger(AmazonTranscribe.class);
+ private AmazonTranscribeAsync amazonTranscribeAsync;
private AmazonS3 amazonS3;
private String bucketName;
- private boolean isAvailable; // Flag for whether or not transcription is
available.
+ private String region;
+ private boolean isAvailable; // Flag for whether or not transcription is
+ // available.
private String clientId;
- private String clientSecret; // Keys used for the API calls.
+ private String clientSecret; // Keys used for the API calls.
+ private AWSStaticCredentialsProvider credsProvider;
/**
- * Create a new AmazonTranscriber with the client keys specified in
- * resources/org/apache/tika/transcribe/transcribe.amazon.properties.
+ * Create a new AmazonTranscribe instance with the client keys specified in
+ * <code>transcribe.amazon.properties</code> which needs to be available on
+ * the Java Classpath.
* Silently becomes unavailable when client keys are unavailable.
- * transcribe.AWS_ACCESS_KEY, transcribe.AWS_SECRET_KEY, and
transcribe.BUCKET_NAME must be set in transcribe.amazon.properties for
transcription to work.
+ * <code>transcribe.AWS_ACCESS_KEY</code>,
+ * <code>transcribe.AWS_SECRET_KEY</code>,
+ * <code>transcribe.BUCKET_NAME</code> and
+ * <code>transcribe.REGION</code> must be set in
+ * <code>transcribe.amazon.properties</code>.
+ * <b>N.B.</b> it is not necessary to create the bucket before hand.
+ * This implementation will automatically create the bucket if one
+ * does not already exist, per the name defined above.
*
- * @since Tika 2.1
+ * @since Tika 2.0
*/
public AmazonTranscribe() {
Properties config = new Properties();
try {
config.load(AmazonTranscribe.class
- .getResourceAsStream(
- PROPERTIES_FILE));
+ .getResourceAsStream(PROPERTIES_FILE));
this.clientId = config.getProperty(ID_PROPERTY);
this.clientSecret = config.getProperty(SECRET_PROPERTY);
this.bucketName = config.getProperty(BUCKET_NAME);
+ this.region = config.getProperty(REGION);
+ BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId,
+ this.clientSecret);
+ this.credsProvider = new AWSStaticCredentialsProvider(creds);
+ amazonS3 = AmazonS3ClientBuilder.standard()
+ .withCredentials(credsProvider).withRegion(this.region)
+ .build();
this.isAvailable = checkAvailable();
+ if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+ try {
+ amazonS3.createBucket(this.bucketName);
+ } catch (AmazonS3Exception e) {
+ throw new RuntimeException(e.getErrorMessage());
+ }
+ }
+ this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder
+ .standard().withCredentials(credsProvider)
+ .withRegion(this.region).build();
} catch (Exception e) {
LOG.warn("Exception reading config file", e);
isAvailable = false;
@@ -97,18 +149,21 @@ public class AmazonTranscribe implements Transcriber {
}
/**
- * Constructs a new
- * {@link PutObjectRequest} object to upload a file to the
- * specified bucket and jobName. After constructing the request,
- * users may optionally specify object metadata or a canned ACL as well.
+ * Constructs a new {@link PutObjectRequest} object to upload a file to the
+ * specified bucket and jobName. After constructing the request, users may
+ * optionally specify object metadata or a canned ACL as well.
*
- * @param file The file to upload to Amazon S3.
- * @param jobName The unique job name for each job(UUID).
+ * @param inputStream
+ * The file to upload to Amazon S3.
+ * @param jobName
+ * The unique job name for each job(UUID).
*/
- private void uploadFileToBucket(File file, String jobName) throws
TikaException {
- PutObjectRequest request = new PutObjectRequest(this.bucketName,
jobName, file);
+ private void uploadFileToBucket(InputStream inputStream, String jobName)
+ throws TikaException {
+ PutObjectRequest request = new PutObjectRequest(this.bucketName,
+ jobName, inputStream, null);
try {
- // Block of code to try
+ @SuppressWarnings("unused")
PutObjectResult response = amazonS3.putObject(request);
} catch (SdkClientException e) {
throw (new TikaException("File Upload to AWS Failed"));
@@ -118,68 +173,76 @@ public class AmazonTranscribe implements Transcriber {
/**
* Starts AWS Transcribe Job without language specification.
*
- * @param inputStream the source input stream.
+ * @param inputStream
+ * the source input stream.
* @return The transcribed string result, NULL if the job failed.
- * @throws TikaException When there is an error transcribing.
- * @throws IOException If an I/O exception of some sort has occurred.
+ * @throws TikaException
+ * When there is an error transcribing.
+ * @throws IOException
+ * If an I/O exception of some sort has occurred.
*/
@Override
- public String transcribe(InputStream inputStream) throws TikaException,
IOException {
- if (!isAvailable()) return null;
+ public String transcribe(InputStream inputStream)
+ throws TikaException, IOException {
+ if (!isAvailable())
+ return null;
String jobName = getJobKey();
- byte[] buffer = new byte[inputStream.available()];
- inputStream.read(buffer);
- File targetFile = new File("src/main/resources/targetFile.tmp");
- try (OutputStream outStream = new FileOutputStream(targetFile)) {
- outStream.write(buffer);
- }
- targetFile.deleteOnExit();
- uploadFileToBucket(targetFile, jobName);
+ uploadFileToBucket(inputStream, jobName);
StartTranscriptionJobRequest startTranscriptionJobRequest = new
StartTranscriptionJobRequest();
Media media = new Media();
media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
- startTranscriptionJobRequest.withMedia(media)
- .withOutputBucketName(this.bucketName)
- .setTranscriptionJobName(jobName);
- amazonTranscribe.startTranscriptionJob(startTranscriptionJobRequest);
- return getTranscriptResult(jobName);
+
startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media)
+ .withOutputBucketName(this.bucketName)
+ .withTranscriptionJobName(jobName)
+ .setRequestCredentialsProvider(credsProvider);
+ amazonTranscribeAsync
+ .startTranscriptionJob(startTranscriptionJobRequest);
+ return getTranscriptText(jobName);
}
/**
* Starts AWS Transcribe Job with language specification.
*
- * @param inputStream the source input stream.
- * @param sourceLanguage <a
href="https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
Language Code</a> for the language used in the input media file.
+ * @param inputStream
+ * the source input stream.
+ * @param sourceLanguage
+ * <a href=
+ *
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+ * Language Code</a> for the language used in the input media
+ * file.
* @return The transcribed string result, NULL if the job failed.
- * @throws TikaException When there is an error transcribing.
- * @throws IOException If an I/O exception of some sort has occurred.
- * @see <a
href="https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
Language Code</a>
+ * @throws TikaException
+ * When there is an error transcribing.
+ * @throws IOException
+ * If an I/O exception of some sort has occurred.
+ * @see <a href=
+ *
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+ * Language Code</a>
*/
@Override
- public String transcribe(InputStream inputStream, String sourceLanguage)
throws TikaException, IOException {
- if (!isAvailable()) return null;
+ public String transcribe(InputStream inputStream, String sourceLanguage)
+ throws TikaException, IOException {
+ if (!isAvailable())
+ return null;
String jobName = getJobKey();
- byte[] buffer = new byte[inputStream.available()];
- inputStream.read(buffer);
- File targetFile = new File("src/main/resources/targetFile.tmp");
- try (OutputStream outStream = new FileOutputStream(targetFile)) {
- outStream.write(buffer);
- }
- targetFile.deleteOnExit();
- uploadFileToBucket(targetFile, jobName);
+ uploadFileToBucket(inputStream, jobName);
StartTranscriptionJobRequest startTranscriptionJobRequest = new
StartTranscriptionJobRequest();
Media media = new Media();
media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
- startTranscriptionJobRequest.withMedia(media)
- .withLanguageCode(LanguageCode.fromValue(sourceLanguage))
- .withOutputBucketName(this.bucketName)
- .setTranscriptionJobName(jobName);
- amazonTranscribe.startTranscriptionJob(startTranscriptionJobRequest);
- return getTranscriptResult(jobName);
+ ((StartTranscriptionJobRequest) startTranscriptionJobRequest
+ .withMedia(media).withOutputBucketName(this.bucketName)
+ .withTranscriptionJobName(jobName)
+ .withRequestCredentialsProvider(credsProvider))
+ .withLanguageCode(
+ LanguageCode.fromValue(sourceLanguage));
+ amazonTranscribeAsync
+ .startTranscriptionJob(startTranscriptionJobRequest);
+ return getTranscriptText(jobName);
}
/**
- * @return true if this Transcriber is probably able to transcribe right
now.
+ * @return true if this Transcriber is probably able to transcribe right
+ * now.
* @since Tika 2.1
*/
@Override
@@ -190,7 +253,8 @@ public class AmazonTranscribe implements Transcriber {
/**
* Sets the client Id for the transcriber API.
*
- * @param id The ID to set.
+ * @param id
+ * The ID to set.
*/
public void setId(String id) {
this.clientId = id;
@@ -200,7 +264,8 @@ public class AmazonTranscribe implements Transcriber {
/**
* Sets the client secret for the transcriber API.
*
- * @param secret The secret to set.
+ * @param secret
+ * The secret to set.
*/
public void setSecret(String secret) {
this.clientSecret = secret;
@@ -210,7 +275,8 @@ public class AmazonTranscribe implements Transcriber {
/**
* Sets the client secret for the transcriber API.
*
- * @param bucket The bucket to set.
+ * @param bucket
+ * The bucket to set.
*/
public void setBucket(String bucket) {
this.bucketName = bucket;
@@ -223,44 +289,118 @@ public class AmazonTranscribe implements Transcriber {
* @return if the service is available
*/
private boolean checkAvailable() {
- return clientId != null &&
- !clientId.equals(DEFAULT_ID) &&
- clientSecret != null &&
- !clientSecret.equals(DEFAULT_SECRET) &&
- bucketName != null &&
- !bucketName.equals(DEFAULT_BUCKET);
+ return clientId != null && !clientId.equals(DEFAULT_ID)
+ && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET)
+ && bucketName != null && !bucketName.equals(DEFAULT_BUCKET);
}
/**
* Gets Transcription result from AWS S3 bucket given the jobName.
*
- * @param fileNameS3 The path of the file to upload to Amazon S3.
+ * @param fileNameS3
+ * The path of the file to upload to Amazon S3.
* @return The transcribed string result, NULL if the job failed.
+ * @throws IOException possible reasons include (i) an End Event is not
received
+ * from AWS S3 SelectObjectContentResult operation and (ii) a parse
exception
+ * whilst processing JSON from the AWS S3 SelectObjectContentResult
operation.
+ * @throws SdkClientException an AWS-specific exception related to
SelectObjectContentResult
+ * operation.
+ * @throws AmazonServiceException possibly thrown if there is an issue
selecting object content
+ * from AWS S3 objects.
*/
- private String getTranscriptResult(String fileNameS3) {
- TranscriptionJob transcriptionJob =
retrieveObjectWhenJobCompleted(fileNameS3);
- if (transcriptionJob != null &&
!TranscriptionJobStatus.FAILED.name().equals(transcriptionJob.getTranscriptionJobStatus()))
{
- return amazonS3.getObjectAsString(this.bucketName, fileNameS3);
- } else
- return null;
+ private String getTranscriptText(String fileNameS3) throws
AmazonServiceException, SdkClientException, IOException {
+ TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(
+ fileNameS3);
+ String text = null;
+ if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name()
+ .equals(transcriptionJob.getTranscriptionJobStatus())) {
+ InputSerialization inputSerialization = new
InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT))
+ .withCompressionType(CompressionType.NONE);
+ OutputSerialization outputSerialization = new
OutputSerialization().withJson(new JSONOutput());
+ SelectObjectContentRequest request = new
SelectObjectContentRequest()
+ .withBucketName(this.bucketName).withKey(fileNameS3 +
".json")
+ .withExpression("Select
s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT
MISSING
+
.withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider);
+ request.setInputSerialization(inputSerialization);
+ request.setOutputSerialization(outputSerialization);
+
+ final AtomicBoolean isResultComplete = new AtomicBoolean(false);
+
+ try (SelectObjectContentResult result = amazonS3
+ .selectObjectContent(request)) {
+ InputStream resultInputStream = result.getPayload()
+ .getRecordsInputStream(
+ new SelectObjectContentEventVisitor() {
+ @Override
+ public void visit(
+
SelectObjectContentEvent.StatsEvent event) {
+ LOG.debug(
+ "Received Stats, Bytes
Scanned: "
+ + event.getDetails()
+ .getBytesScanned()
+ + " Bytes Processed: "
+ + event.getDetails()
+ .getBytesProcessed());
+ }
+
+ /*
+ * An End Event informs that the request
has
+ * finished successfully.
+ */
+ @Override
+ public void visit(
+ SelectObjectContentEvent.EndEvent
event) {
+ isResultComplete.set(true);
+ LOG.debug(
+ "Received End Event. Result is
complete.");
+ }
+ });
+ text = new BufferedReader(
+ new InputStreamReader(resultInputStream,
StandardCharsets.UTF_8))
+ .lines()
+ .collect(Collectors.joining("\n"));
+ }
+ /*
+ * The End Event indicates all matching records have been
+ * transmitted. If the End Event is not received, the results
+ * may be incomplete.
+ */
+ if (!isResultComplete.get()) {
+ throw new IOException(
+ "S3 Select request was incomplete as End Event was not
received.");
+ }
+ }
+ JSONParser parser = new JSONParser();
+ JSONObject obj = null;
+ try {
+ obj = (JSONObject) parser.parse(text);
+ } catch (ParseException e) {
+ throw new IOException(e.getMessage(), e);
+ }
+ return obj.get("transcript").toString();
}
/**
* Private helper function to get object from s3.
*
- * @param jobName The unique job name for each job(UUID).
+ * @param jobName
+ * The unique job name for each job(UUID).
* @return TranscriptionJob object
*/
private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) {
GetTranscriptionJobRequest getTranscriptionJobRequest = new
GetTranscriptionJobRequest();
+ getTranscriptionJobRequest
+ .withRequestCredentialsProvider(credsProvider);
getTranscriptionJobRequest.setTranscriptionJobName(jobName);
while (true) {
- GetTranscriptionJobResult innerResult =
amazonTranscribe.getTranscriptionJob(getTranscriptionJobRequest);
- String status =
innerResult.getTranscriptionJob().getTranscriptionJobStatus();
- if (TranscriptionJobStatus.COMPLETED.name().equals(status) ||
- TranscriptionJobStatus.FAILED.name().equals(status)) {
+ GetTranscriptionJobResult innerResult = amazonTranscribeAsync
+ .getTranscriptionJob(getTranscriptionJobRequest);
+ String status = innerResult.getTranscriptionJob()
+ .getTranscriptionJobStatus();
+ if (TranscriptionJobStatus.COMPLETED.name().equals(status)
+ || TranscriptionJobStatus.FAILED.name().equals(status)) {
return innerResult.getTranscriptionJob();
}
}
}
-}
+}
\ No newline at end of file
diff --git
a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
index 98f8459..88202bb 100644
---
a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
+++
b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
@@ -47,69 +47,69 @@ import org.slf4j.LoggerFactory;
*/
public class GoogleTranslator extends AbstractTranslator {
- private static final Logger LOG =
LoggerFactory.getLogger(GoogleTranslator.class);
-
- private static final String GOOGLE_TRANSLATE_URL_BASE =
"https://www.googleapis.com/language/translate/v2";
-
- private static final String DEFAULT_KEY = "dummy-secret";
-
- private WebClient client;
-
- private String apiKey;
-
- private boolean isAvailable;
-
- public GoogleTranslator() {
- this.client = WebClient.create(GOOGLE_TRANSLATE_URL_BASE);
- this.isAvailable = true;
- Properties config = new Properties();
- try {
- config.load(GoogleTranslator.class
- .getResourceAsStream(
-
"translator.google.properties"));
- this.apiKey =
config.getProperty("translator.client-secret");
- if (this.apiKey.equals(DEFAULT_KEY))
- this.isAvailable = false;
- } catch (Exception e) {
- LOG.warn("Exception reading config file", e);
- isAvailable = false;
- }
- }
-
- @Override
- public String translate(String text, String sourceLanguage,
- String targetLanguage) throws TikaException,
IOException {
- if (!this.isAvailable)
- return text;
- Response response = client.accept(MediaType.APPLICATION_JSON)
- .query("key", apiKey).query("source",
sourceLanguage)
- .query("target", targetLanguage).query("q",
text).get();
- BufferedReader reader = new BufferedReader(new
InputStreamReader(
- (InputStream) response.getEntity(), UTF_8));
- String line = null;
- StringBuffer responseText = new StringBuffer();
- while ((line = reader.readLine()) != null) {
- responseText.append(line);
- }
-
- ObjectMapper mapper = new ObjectMapper();
- JsonNode jsonResp = mapper.readTree(responseText.toString());
- return jsonResp.findValuesAsText("translatedText").get(0);
- }
-
- @Override
- public String translate(String text, String targetLanguage)
- throws TikaException, IOException {
- if (!this.isAvailable)
- return text;
-
- String sourceLanguage = detectLanguage(text).getLanguage();
- return translate(text, sourceLanguage, targetLanguage);
- }
-
- @Override
- public boolean isAvailable() {
- return this.isAvailable;
- }
+ private static final Logger LOG =
LoggerFactory.getLogger(GoogleTranslator.class);
+
+ private static final String GOOGLE_TRANSLATE_URL_BASE =
"https://www.googleapis.com/language/translate/v2";
+
+ private static final String DEFAULT_KEY = "dummy-secret";
+
+ private WebClient client;
+
+ private String apiKey;
+
+ private boolean isAvailable;
+
+ public GoogleTranslator() {
+ this.client = WebClient.create(GOOGLE_TRANSLATE_URL_BASE);
+ this.isAvailable = true;
+ Properties config = new Properties();
+ try {
+ config.load(GoogleTranslator.class
+ .getResourceAsStream(
+ "translator.google.properties"));
+ this.apiKey = config.getProperty("translator.client-secret");
+ if (this.apiKey.equals(DEFAULT_KEY))
+ this.isAvailable = false;
+ } catch (Exception e) {
+ LOG.warn("Exception reading config file", e);
+ isAvailable = false;
+ }
+ }
+
+ @Override
+ public String translate(String text, String sourceLanguage,
+ String targetLanguage) throws TikaException, IOException {
+ if (!this.isAvailable)
+ return text;
+ Response response = client.accept(MediaType.APPLICATION_JSON)
+ .query("key", apiKey).query("source", sourceLanguage)
+ .query("target", targetLanguage).query("q", text).get();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ (InputStream) response.getEntity(), UTF_8));
+ String line = null;
+ StringBuffer responseText = new StringBuffer();
+ while ((line = reader.readLine()) != null) {
+ responseText.append(line);
+ }
+
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode jsonResp = mapper.readTree(responseText.toString());
+ return jsonResp.findValuesAsText("translatedText").get(0);
+ }
+
+ @Override
+ public String translate(String text, String targetLanguage)
+ throws TikaException, IOException {
+ if (!this.isAvailable)
+ return text;
+
+ String sourceLanguage = detectLanguage(text).getLanguage();
+ return translate(text, sourceLanguage, targetLanguage);
+ }
+
+ @Override
+ public boolean isAvailable() {
+ return this.isAvailable;
+ }
}