This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2c951a3 TIKA-3384 -- convert transcribe to a traditional parser
2c951a3 is described below
commit 2c951a35e57cf6624457798d51c1b8cbffff0f7b
Author: tallison <[email protected]>
AuthorDate: Tue May 18 07:44:18 2021 -0400
TIKA-3384 -- convert transcribe to a traditional parser
---
pom.xml | 1 -
.../org/apache/tika/transcribe/Transcriber.java | 60 ---
.../tika/example/TranscribeTranslateExample.java | 83 ++--
tika-parsers/tika-parsers-ml/pom.xml | 1 +
.../tika-parsers-ml/tika-transcribe-aws}/pom.xml | 47 +-
.../parser/transcribe/aws/AmazonTranscribe.java | 398 ++++++++++++++++
.../transcribe/aws/AmazonTranscribeTest.java | 310 ++++++++++++
.../test-documents}/ShortAudioSampleFrench.mp3 | Bin
.../de-DE_(We_Are_At_School_x2).mp3 | Bin
.../en-AU_(A_Little_Bottle_Of_Water).mp3 | Bin
.../en-GB_(A_Little_Bottle_Of_Water).mp3 | Bin
.../en-US_(A_Little_Bottle_Of_Water).mp3 | Bin
.../test/resources/test-documents}/en-US_(Hi).mp4 | Bin
.../it-IT_(We_Are_Having_Class_x2).mp3 | Bin
.../test-documents}/ja-JP_(We_Are_At_School).mp3 | Bin
.../test-documents}/ko-KR_(Annyeonghaseyo).mp4 | Bin
.../ko-KR_(We_Are_Having_Class_x2).mp3 | Bin
.../test-documents}/pt-BR_(We_Are_At_School).mp3 | Bin
.../test/resources/tika-config-transcribe-aws.xml | 32 ++
.../apache/tika/transcribe/AmazonTranscribe.java | 406 ----------------
.../org.apache.tika.language.translate.Translator | 16 -
.../transcribe.amazon.properties | 18 -
.../tika/transcribe/AmazonTranscribeTest.java | 527 ---------------------
23 files changed, 830 insertions(+), 1069 deletions(-)
diff --git a/pom.xml b/pom.xml
index f8c6591..d0e43d4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -52,7 +52,6 @@
<module>tika-translate</module>
<module>tika-example</module>
<module>tika-java7</module>
- <module>tika-transcribe</module>
</modules>
<profiles>
diff --git
a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
deleted file mode 100644
index 3546256..0000000
--- a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.transcribe;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-
-/**
- * Interface for Transcriber services.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-94">TIKA-94</a>
- * @since Tika 2.1
- */
-public interface Transcriber {
- /**
- * Transcribe the given file.
- *
- * @param inputStream the source input stream.
- * @return The transcribed string result, NULL if the job failed.
- * @throws TikaException When there is an error transcribing.
- * @throws IOException If an I/O exception of some sort has occurred.
- * @since 2.1
- */
- public String transcribe(InputStream inputStream) throws TikaException,
IOException;
-
- /**
- * Transcribe the given the file and the source language.
- *
- * @param inputStream the source input stream.
- * @param sourceLanguage The language code for the language used in the
input media file.
- * @return The transcribed string result, NULL if the job failed.
- * @throws TikaException When there is an error transcribing.
- * @throws IOException If an I/O exception of some sort has occurred.
- * @since 2.1
- */
- public String transcribe(InputStream inputStream, String sourceLanguage)
throws TikaException, IOException;
-
- /**
- * @return true if this Transcriber is probably able to transcribe right
now.
- * @since Tika 2.1
- */
- public boolean isAvailable();
-}
diff --git
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
index 12dd7e5..a90d322 100644
---
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
+++
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
@@ -17,22 +17,23 @@
package org.apache.tika.example;
-import java.io.FileInputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.language.translate.GoogleTranslator;
import org.apache.tika.language.translate.Translator;
-import org.apache.tika.transcribe.AmazonTranscribe;
-import org.apache.tika.transcribe.Transcriber;
/**
* This example demonstrates primitive logic for
* chaining Tika API calls. In this case translation
- * could be considered as a downstream process to
+ * could be considered as a downstream process to
* transcription.
* We simply pass the output of
- * a call to {@link Transcriber#transcribe(java.io.InputStream)}
- * into {@link Translator#translate(String, String)}.
- * The {@link GoogleTranslator} is configured with a target
+ * a call to {@link Tika#parseToString(Path)}
+ * into {@link Translator#translate(String, String)}.
+ * The {@link GoogleTranslator} is configured with a target
* language of "en-US".
* @author lewismc
*
@@ -42,7 +43,7 @@ public class TranscribeTranslateExample {
/**
* Use {@link GoogleTranslator} to execute translation on
* input data. This implementation needs configured as explained in the
Javadoc.
- * In this implementation, Google will try to guess the input language.
The target
+ * In this implementation, Google will try to guess the input language.
The target
* language is "en-US".
* @param text input text to translate.
* @return translated text String.
@@ -61,43 +62,55 @@ public class TranscribeTranslateExample {
}
/**
- * Use {@link AmazonTranscribe} to execute transcription on input data.
- * This implementation needs configured as explained in the Javadoc.
+ * Use {@link org.apache.tika.parser.transcribe.aws.AmazonTranscribe} to
execute transcription
+ * on input data.
+ * This implementation needs to be configured as explained in the Javadoc.
* @param file the name of the file (which needs to be on the Java
Classpath) to transcribe.
* @return transcribed text.
*/
- public static String amazonTranscribe(String file) {
- String filePath =
TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath();
- String result = null;
- Transcriber transcriber = new AmazonTranscribe();
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new FileInputStream(filePath));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- return result;
+ public static String amazonTranscribe(Path tikaConfig, Path file) throws
Exception {
+ return new Tika(new TikaConfig(tikaConfig)).parseToString(file);
}
/**
* Main method to run this example. This program can be invoked as follows
* <ol>
- * <li><code>transcribe-translate ${file}</code>; which executes both
- * transcription then translation on the given resource, or
- * <li><code>transcribe ${file}</code>; which executes only
translation</li>
- * @param args either of the commands described above and the input file
- * (which needs to be on the Java Classpath).
+ * <li><code>transcribe-translate ${tika-config.xml} ${file}</code>; which
executes both
+ * transcription then translation on the given resource, or
+ * <li><code>transcribe ${tika-config.xml} ${file}</code>; which executes
only transcription</li>
+ * @param args either of the commands described above and the input file
+ * (which needs to be on the Java Classpath).
+ *
+ *
+ *
+ * ${tika-config.xml} must include credentials for aws and a temporary
storage bucket:
+ * <pre>
+ * {@code
+ * <properties>
+ * <parsers>
+ * <parser class="org.apache.tika.parser.DefaultParser"/>
+ * <parser
class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe">
+ * <params>
+ * <param name="bucket" type="string">bucket</param>
+ * <param name="clientId" type="string">clientId</param>
+ * <param name="clientSecret" type="string">clientSecret</param>
+ * </params>
+ * </parser>
+ * </parsers>
+ * </properties>
+ * }
+ * </pre>
*/
- public static void main (String[] args) {
+ public static void main (String[] args) throws Exception {
String text = null;
- if (args.length != 0) {
- if ("transcribe-translate".equals(args[0])) {
- text = googleTranslateToEnglish(amazonTranscribe(args[1]));
- System.out.print("Transcription and translation
successful!\nEXTRAXCTED TEXT: " + text);
- } else if ("transcribe".equals(args[0])) {
- text = amazonTranscribe(args[1]);
- System.out.print("Transcription successful!\nEXTRAXCTED TEXT:
" + text);
+        // Invocation is "<command> <tika-config.xml> <file>" (see the Javadoc above), so the
+        // command is args[0], the Tika config is args[1] and the media file is args[2].
+        if (args.length > 2) {
+            if ("transcribe-translate".equals(args[0])) {
+                text = googleTranslateToEnglish(
+                        amazonTranscribe(Paths.get(args[1]), Paths.get(args[2])));
+                System.out.print(
+                        "Transcription and translation successful!\nEXTRACTED TEXT: " + text);
+            } else if ("transcribe".equals(args[0])) {
+                text = amazonTranscribe(Paths.get(args[1]), Paths.get(args[2]));
+                System.out.print("Transcription successful!\nEXTRACTED TEXT: " + text);
} else {
System.out.print("Incorrect invocation, see Javadoc.");
}
diff --git a/tika-parsers/tika-parsers-ml/pom.xml
b/tika-parsers/tika-parsers-ml/pom.xml
index ba9bd38..2dcde9e 100644
--- a/tika-parsers/tika-parsers-ml/pom.xml
+++ b/tika-parsers/tika-parsers-ml/pom.xml
@@ -40,6 +40,7 @@
<module>tika-age-recogniser</module>
<module>tika-parser-advancedmedia-module</module>
<module>tika-dl</module>
+ <module>tika-transcribe-aws</module>
</modules>
<build>
diff --git a/tika-transcribe/pom.xml
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml
similarity index 78%
rename from tika-transcribe/pom.xml
rename to tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml
index aadb137..1e287c5 100644
--- a/tika-transcribe/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml
@@ -25,20 +25,19 @@
<modelVersion>4.0.0</modelVersion>
<parent>
+ <artifactId>tika-parsers-ml</artifactId>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
<version>2.0.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
</parent>
- <artifactId>tika-transcribe</artifactId>
+ <artifactId>tika-transcribe-aws</artifactId>
<packaging>bundle</packaging>
- <name>Apache Tika transcribe</name>
+ <name>Apache Tika transcribe aws</name>
<url>http://tika.apache.org/</url>
<!--TODO use latest aws version or the one defined in the tika-parent-->
<dependencies>
<dependency>
- <groupId>org.apache.tika</groupId>
+ <groupId>${project.groupId}</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
</dependency>
@@ -55,9 +54,37 @@
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>${jackson.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>${jackson.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${commons.codec.version}</version>
+ </dependency>
+ <dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>${aws.version}</version>
@@ -71,6 +98,14 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
</dependency>
</dependencies>
<build>
@@ -111,7 +146,7 @@
<configuration>
<archive>
<manifestEntries>
-
<Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name>
+
<Automatic-Module-Name>org.apache.tika.parser.transcribe.aws</Automatic-Module-Name>
</manifestEntries>
</archive>
</configuration>
diff --git
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
new file mode 100644
index 0000000..91e8452
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
@@ -0,0 +1,398 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.transcribe.aws;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+
+import com.amazonaws.AmazonServiceException;
+import com.amazonaws.SdkClientException;
+import com.amazonaws.auth.AWSStaticCredentialsProvider;
+import com.amazonaws.auth.BasicAWSCredentials;
+import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.AmazonS3Exception;
+import com.amazonaws.services.s3.model.CompressionType;
+import com.amazonaws.services.s3.model.ExpressionType;
+import com.amazonaws.services.s3.model.InputSerialization;
+import com.amazonaws.services.s3.model.JSONInput;
+import com.amazonaws.services.s3.model.JSONOutput;
+import com.amazonaws.services.s3.model.JSONType;
+import com.amazonaws.services.s3.model.OutputSerialization;
+import com.amazonaws.services.s3.model.PutObjectRequest;
+import com.amazonaws.services.s3.model.PutObjectResult;
+import com.amazonaws.services.s3.model.SelectObjectContentEvent;
+import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor;
+import com.amazonaws.services.s3.model.SelectObjectContentRequest;
+import com.amazonaws.services.s3.model.SelectObjectContentResult;
+import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
+import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
+import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
+import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
+import com.amazonaws.services.transcribe.model.LanguageCode;
+import com.amazonaws.services.transcribe.model.Media;
+import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
+import com.amazonaws.services.transcribe.model.TranscriptionJob;
+import com.amazonaws.services.transcribe.model.TranscriptionJobStatus;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a>
+ * implementation. See Javadoc for configuration options.
+ * <p>
+ * Silently becomes unavailable when client keys are unavailable.
+ *
+ * <b>N.B.</b> it is not necessary to create the bucket beforehand.
+ * This implementation will automatically create the bucket if one
+ * does not already exist under the configured bucket name.
+ *
+ * @since Tika 2.0
+ */
+
+public class AmazonTranscribe extends AbstractParser implements Initializable {
+ private static final Logger LOG =
LoggerFactory.getLogger(AmazonTranscribe.class);
+ private AmazonTranscribeAsync amazonTranscribeAsync; // Transcribe client; built in initialize().
+ private AmazonS3 amazonS3; // S3 client; built in initialize().
+ private String bucketName; // Bucket for uploads and transcription output; set via setBucket().
+ private String region; // Optional AWS region; set via setRegion(), may be null.
+ private boolean isAvailable; // Flag for whether or not transcription is
+ // available.
+ private String clientId;
+ private String clientSecret; // Keys used for the API calls.
+ private AWSStaticCredentialsProvider credsProvider; // Built in initialize() from clientId/clientSecret.
+
+ //https://docs.aws.amazon.com/transcribe/latest/dg/input.html
+ protected static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(
+ new HashSet<>(Arrays.asList(MediaType.audio("x-flac"),
MediaType.audio("mp3"),
+ MediaType.audio("mpeg"), MediaType.video("ogg"),
MediaType.audio("vnd.wave"),
+ MediaType.audio("mp4"), MediaType.video("mp4"),
MediaType.application("mp4"),
+ MediaType.video("quicktime"))));
+
+
+    /**
+     * Reports the media types this parser handles.
+     *
+     * @param context the parse context (not consulted).
+     * @return {@link #SUPPORTED_TYPES} when the parser is available, otherwise an empty set.
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        if (!isAvailable) {
+            // Type-safe replacement for the raw Collections.EMPTY_SET constant.
+            return Collections.emptySet();
+        }
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Transcribes the given media stream with AWS Transcribe and writes the transcript to the
+     * handler as a single XHTML paragraph.
+     * <p>
+     * The stream is uploaded to the configured S3 bucket under a freshly generated job name, a
+     * transcription job is started for that object, and the transcript text is emitted as the
+     * content of one {@code <p>} element. If the parser is not available, nothing is written.
+     *
+     * @param stream   the source input stream.
+     * @param handler  handler that receives the XHTML events.
+     * @param metadata metadata handed to the {@link XHTMLContentHandler}.
+     * @param context  set a {@link LanguageCode} in the ParseContext if the language is known;
+     *                 otherwise automatic language identification is requested.
+     * @throws TikaException When there is an error transcribing.
+     * @throws IOException   If an I/O exception of some sort has occurred.
+     * @throws SAXException  If the content handler reports an error.
+     * @see <a href=
+     * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">
+     * AWS Language Code</a>
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        if (!isAvailable) {
+            return;
+        }
+
+        // The generated key names both the uploaded S3 object and the transcription job.
+        final String jobKey = getJobKey();
+        final LanguageCode knownLanguage = context.get(LanguageCode.class);
+        uploadFileToBucket(stream, jobKey);
+
+        final StartTranscriptionJobRequest jobRequest = new StartTranscriptionJobRequest();
+        final Media source = new Media();
+        source.setMediaFileUri(amazonS3.getUrl(bucketName, jobKey).toString());
+        jobRequest.setMedia(source);
+        jobRequest.setOutputBucketName(this.bucketName);
+        jobRequest.setTranscriptionJobName(jobKey);
+        jobRequest.setRequestCredentialsProvider(credsProvider);
+
+        if (knownLanguage == null) {
+            jobRequest.withIdentifyLanguage(true);
+        } else {
+            jobRequest.withLanguageCode(knownLanguage);
+        }
+        amazonTranscribeAsync.startTranscriptionJob(jobRequest);
+
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        final String transcript = getTranscriptText(jobKey);
+        xhtml.startElement("p");
+        xhtml.characters(transcript);
+        xhtml.endElement("p");
+        xhtml.endDocument();
+    }
+
+
+    /**
+     * Tells whether this parser currently considers itself usable.
+     *
+     * @return the current value of the availability flag.
+     * @since Tika 2.1
+     */
+    public boolean isAvailable() {
+        return isAvailable;
+    }
+
+    /**
+     * Configures the client id for the transcriber API and re-evaluates availability.
+     *
+     * @param id The ID to set.
+     */
+    @Field
+    public void setClientId(String id) {
+        clientId = id;
+        isAvailable = checkAvailable();
+    }
+
+    /**
+     * Configures the client secret for the transcriber API and re-evaluates availability.
+     *
+     * @param secret The secret to set.
+     */
+    @Field
+    public void setClientSecret(String secret) {
+        clientSecret = secret;
+        isAvailable = checkAvailable();
+    }
+
+    /**
+     * Configures the name of the S3 bucket used for uploads and transcription output, and
+     * re-evaluates availability.
+     *
+     * @param bucket The bucket to set.
+     */
+    @Field
+    public void setBucket(String bucket) {
+        bucketName = bucket;
+        isAvailable = checkAvailable();
+    }
+
+    /**
+     * Configures the AWS region used when the clients are built, and re-evaluates
+     * availability. The region is not one of the values availability depends on.
+     *
+     * @param region The region to set.
+     */
+    @Field
+    public void setRegion(String region) {
+        this.region = region;
+        isAvailable = checkAvailable();
+    }
+
+    /**
+     * Checks whether the mandatory configuration values are present.
+     *
+     * @return {@code true} only if client id, client secret and bucket name are all set.
+     */
+    private boolean checkAvailable() {
+        final boolean somethingMissing =
+                clientId == null || clientSecret == null || bucketName == null;
+        return !somethingMissing;
+    }
+
+    /**
+     * Generates a unique key for a transcription job.
+     *
+     * @return the string form of a random {@link UUID}.
+     */
+    private String getJobKey() {
+        final UUID jobId = UUID.randomUUID();
+        return jobId.toString();
+    }
+
+    /**
+     * Uploads the media stream to the configured S3 bucket, using the job name as the object
+     * key. No object metadata is supplied with the request (the metadata argument is
+     * {@code null}).
+     *
+     * @param inputStream the media content to upload to Amazon S3.
+     * @param jobName     The unique job name for each job (UUID); used as the S3 object key.
+     * @throws TikaException if the AWS SDK reports a failure while uploading; the SDK
+     *                       exception is attached as the cause.
+     */
+    private void uploadFileToBucket(InputStream inputStream, String jobName)
+            throws TikaException {
+        PutObjectRequest request =
+                new PutObjectRequest(this.bucketName, jobName, inputStream, null);
+        try {
+            amazonS3.putObject(request);
+        } catch (SdkClientException e) {
+            // Keep the SDK exception as the cause so the real reason is not lost.
+            throw new TikaException("File Upload to AWS Failed", e);
+        }
+    }
+
+    /**
+     * Fetches the transcript text of a transcription job from the S3 bucket.
+     * <p>
+     * Waits until the job has finished, then runs an S3 Select query against the
+     * {@code <jobName>.json} result object to extract the first transcript.
+     *
+     * @param fileNameS3 the job name; the result object is read from
+     *                   {@code fileNameS3 + ".json"}.
+     * @return The transcribed string result, {@code null} if the job failed or the result
+     *         holds no transcript.
+     * @throws IOException            possible reasons include (i) an End Event is not received
+     *                                from the AWS S3 SelectObjectContentResult operation and
+     *                                (ii) a parse exception whilst processing JSON from the
+     *                                AWS S3 SelectObjectContentResult operation.
+     * @throws SdkClientException     an AWS-specific exception related to the
+     *                                SelectObjectContentResult operation.
+     * @throws AmazonServiceException possibly thrown if there is an issue selecting object
+     *                                content from AWS S3 objects.
+     */
+    private String getTranscriptText(String fileNameS3)
+            throws AmazonServiceException, SdkClientException, IOException {
+        TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(fileNameS3);
+        if (transcriptionJob == null || TranscriptionJobStatus.FAILED.name()
+                .equals(transcriptionJob.getTranscriptionJobStatus())) {
+            // No result object to query; honour the documented "null on failure" contract
+            // instead of handing a null string to the JSON parser.
+            return null;
+        }
+
+        InputSerialization inputSerialization =
+                new InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT))
+                        .withCompressionType(CompressionType.NONE);
+        OutputSerialization outputSerialization =
+                new OutputSerialization().withJson(new JSONOutput());
+        SelectObjectContentRequest request =
+                new SelectObjectContentRequest().withBucketName(this.bucketName)
+                        .withKey(fileNameS3 + ".json").withExpression(
+                        "Select s.results.transcripts[0].transcript from S3Object s")
+                        //WHERE transcript IS NOT MISSING
+                        .withExpressionType(ExpressionType.SQL)
+                        .withRequestCredentialsProvider(credsProvider);
+        request.setInputSerialization(inputSerialization);
+        request.setOutputSerialization(outputSerialization);
+
+        final AtomicBoolean isResultComplete = new AtomicBoolean(false);
+        String text;
+        try (SelectObjectContentResult result = amazonS3.selectObjectContent(request)) {
+            InputStream resultInputStream = result.getPayload()
+                    .getRecordsInputStream(new SelectObjectContentEventVisitor() {
+                        @Override
+                        public void visit(SelectObjectContentEvent.StatsEvent event) {
+                            LOG.debug("Received Stats, Bytes Scanned: " +
+                                    event.getDetails().getBytesScanned() +
+                                    " Bytes Processed: " +
+                                    event.getDetails().getBytesProcessed());
+                        }
+
+                        /*
+                         * An End Event informs that the request has
+                         * finished successfully.
+                         */
+                        @Override
+                        public void visit(SelectObjectContentEvent.EndEvent event) {
+                            isResultComplete.set(true);
+                            LOG.debug("Received End Event. Result is complete.");
+                        }
+                    });
+            text = new BufferedReader(
+                    new InputStreamReader(resultInputStream, StandardCharsets.UTF_8)).lines()
+                    .collect(Collectors.joining("\n"));
+        }
+        /*
+         * The End Event indicates all matching records have been
+         * transmitted. If the End Event is not received, the results
+         * may be incomplete.
+         */
+        if (!isResultComplete.get()) {
+            throw new IOException(
+                    "S3 Select request was incomplete as End Event was not received.");
+        }
+
+        JSONObject obj;
+        try {
+            obj = (JSONObject) new JSONParser().parse(text);
+        } catch (ParseException e) {
+            throw new IOException(e.getMessage(), e);
+        }
+        // The selected record may lack the "transcript" key; avoid an NPE in that case.
+        Object transcript = obj == null ? null : obj.get("transcript");
+        return transcript == null ? null : transcript.toString();
+    }
+
+    /** Pause between two status polls of a running transcription job, in milliseconds. */
+    private static final long POLL_INTERVAL_MILLIS = 1000L;
+
+    /**
+     * Polls AWS Transcribe until the named job reports COMPLETED or FAILED and returns it.
+     * <p>
+     * The loop pauses {@link #POLL_INTERVAL_MILLIS} between polls instead of issuing
+     * back-to-back requests. There is no upper bound on the total waiting time.
+     *
+     * @param jobName The unique job name for each job (UUID).
+     * @return the TranscriptionJob object, in COMPLETED or FAILED state.
+     * @throws IOException if the waiting thread is interrupted; the interrupt flag is restored
+     *                     before the exception is thrown.
+     */
+    private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) throws IOException {
+        GetTranscriptionJobRequest getTranscriptionJobRequest = new GetTranscriptionJobRequest();
+        getTranscriptionJobRequest.withRequestCredentialsProvider(credsProvider);
+        getTranscriptionJobRequest.setTranscriptionJobName(jobName);
+        while (true) {
+            GetTranscriptionJobResult innerResult =
+                    amazonTranscribeAsync.getTranscriptionJob(getTranscriptionJobRequest);
+            String status = innerResult.getTranscriptionJob().getTranscriptionJobStatus();
+            if (TranscriptionJobStatus.COMPLETED.name().equals(status) ||
+                    TranscriptionJobStatus.FAILED.name().equals(status)) {
+                return innerResult.getTranscriptionJob();
+            }
+            try {
+                // Give the job time to progress rather than spinning on the service API.
+                Thread.sleep(POLL_INTERVAL_MILLIS);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                throw new IOException(
+                        "Interrupted while waiting for transcription job " + jobName, e);
+            }
+        }
+    }
+
+    /**
+     * Builds the AWS credentials, the S3 client and the Transcribe client from the configured
+     * values, creating the bucket if it does not exist yet.
+     * <p>
+     * Does nothing when client id, client secret or bucket name is missing. The region is
+     * applied to both clients only when one was configured. A bucket-creation failure is
+     * propagated as a {@link TikaConfigException}; any other failure is logged and marks the
+     * parser unavailable.
+     *
+     * @param params the configured parameters (not consulted; values arrive via the setters).
+     * @throws TikaConfigException if the bucket does not exist and cannot be created.
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        if (!checkAvailable()) {
+            return;
+        }
+
+        try {
+            BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId, this.clientSecret);
+            this.credsProvider = new AWSStaticCredentialsProvider(creds);
+
+            AmazonS3ClientBuilder s3Builder =
+                    AmazonS3ClientBuilder.standard().withCredentials(credsProvider);
+            AmazonTranscribeAsyncClientBuilder transcribeBuilder =
+                    AmazonTranscribeAsyncClientBuilder.standard().withCredentials(credsProvider);
+            if (region != null) {
+                // Apply the region to both clients, and only when one is configured.
+                s3Builder.withRegion(this.region);
+                transcribeBuilder.withRegion(this.region);
+            }
+
+            this.amazonS3 = s3Builder.build();
+            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+                try {
+                    amazonS3.createBucket(this.bucketName);
+                } catch (AmazonS3Exception e) {
+                    throw new TikaConfigException("couldn't create bucket", e);
+                }
+            }
+            this.amazonTranscribeAsync = transcribeBuilder.build();
+        } catch (TikaConfigException e) {
+            // Do not let the catch-all below swallow the configuration failure.
+            throw e;
+        } catch (Exception e) {
+            LOG.warn("Exception reading config file", e);
+            isAvailable = false;
+        }
+
+    }
+
+    /**
+     * Re-evaluates availability after configuration.
+     * <p>
+     * The parser is available only if the mandatory configuration values are present and both
+     * AWS clients were built, so a failed {@code initialize} is not reported as available.
+     * NOTE(review): assumes the framework calls this after {@code initialize} - confirm.
+     *
+     * @param problemHandler handler for initialization problems (currently unused).
+     * @throws TikaConfigException declared by the interface; not thrown here.
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        //TODO alert user if they've gotten 1 or 2 out of three?
+        this.isAvailable =
+                checkAvailable() && amazonS3 != null && amazonTranscribeAsync != null;
+    }
+}
diff --git
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
new file mode 100644
index 0000000..be4f76a
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.transcribe.aws;
+
+import java.io.InputStream;
+
+import com.amazonaws.services.transcribe.model.LanguageCode;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+//TODO: Check the ACTUAL output of Amazon Transcribe.
+
+/**
+ * Tests tika-transcribe through a {@link Parser} built from
+ * <code>tika-config-transcribe-aws.xml</code>.
+ * 1) Each sample is transcribed twice: once with the source language supplied
+ * through the {@link ParseContext} and once without any language hint.
+ * 2) Both audio (mp3) and video (mp4) files are used in these tests.
+ */
+@Ignore("Ignore until finalize AmazonTrancsribe Interface & build Tika")
+public class AmazonTranscribeTest extends TikaTest {
+
+    /** Resource name of the test config; it lives at the root of the test classpath. */
+    private static final String CONFIG = "/tika-config-transcribe-aws.xml";
+
+    static Parser PARSER;
+
+    /**
+     * Loads the parser under test from the test configuration.
+     *
+     * @throws Exception if the configuration is missing or cannot be parsed
+     */
+    @BeforeClass
+    public static void setUp() throws Exception {
+        // The leading '/' makes the lookup absolute; without it the name would be
+        // resolved relative to this class's package.
+        try (InputStream is = AmazonTranscribeTest.class.getResourceAsStream(CONFIG)) {
+            if (is == null) {
+                throw new IllegalStateException("test config not found on classpath: " + CONFIG);
+            }
+            PARSER = new TikaConfig(is).getParser();
+        }
+    }
+
+    /**
+     * Parses <code>file</code> with {@link #PARSER} and asserts that the XML output
+     * contains <code>expected</code>.
+     *
+     * @param file     name of the test document
+     * @param language source language to place in the parse context, or
+     *                 <code>null</code> to parse without a language hint
+     * @param expected text that must appear in the output
+     */
+    private void assertTranscript(String file, LanguageCode language, String expected)
+            throws Exception {
+        String xml;
+        if (language == null) {
+            xml = getXML(file, PARSER).xml;
+        } else {
+            ParseContext context = new ParseContext();
+            context.set(LanguageCode.class, language);
+            xml = getXML(file, PARSER, context).xml;
+        }
+        assertContains(expected, xml);
+    }
+
+    /** Audio file, source language given: en-US (English - United States). */
+    @Test
+    public void testAmazonTranscribeAudio_enUS() throws Exception {
+        assertTranscript("en-US_(A_Little_Bottle_Of_Water).mp3", LanguageCode.EnUS,
+                "a little bottle of water.");
+    }
+
+    /** Audio file, no source language passed in; the file is en-US. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_enUS() throws Exception {
+        assertTranscript("en-US_(A_Little_Bottle_Of_Water).mp3", null,
+                "a little bottle of water.");
+    }
+
+    /** Video file, source language given: en-US (English - United States). */
+    @Test
+    public void testAmazonTranscribeVideo_enUS() throws Exception {
+        assertTranscript("en-US_(Hi).mp4", LanguageCode.EnUS, "Hi");
+    }
+
+    /** Video file, no source language passed in; the file is en-US. */
+    @Test
+    public void testAmazonTranscribeUnknownVideo_enUS() throws Exception {
+        assertTranscript("en-US_(Hi).mp4", null, "Hi");
+    }
+
+    /** Audio file, source language given: en-GB (English - Great Britain). */
+    @Test
+    public void testAmazonTranscribeAudio_enGB() throws Exception {
+        assertTranscript("en-GB_(A_Little_Bottle_Of_Water).mp3", LanguageCode.EnGB,
+                "a little bottle of water.");
+    }
+
+    /** Audio file, no source language passed in; the file is en-GB. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_enGB() throws Exception {
+        assertTranscript("en-GB_(A_Little_Bottle_Of_Water).mp3", null,
+                "a little bottle of water.");
+    }
+
+    /** Audio file, source language given: en-AU (English - Australia). */
+    @Test
+    public void testAmazonTranscribeAudio_enAU() throws Exception {
+        assertTranscript("en-AU_(A_Little_Bottle_Of_Water).mp3", LanguageCode.EnAU,
+                "a little bottle of water.");
+    }
+
+    /** Audio file, no source language passed in; the file is en-AU. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_enAU() throws Exception {
+        assertTranscript("en-AU_(A_Little_Bottle_Of_Water).mp3", null,
+                "a little bottle of water.");
+    }
+
+    /** Audio file, source language given: de-DE (German). */
+    @Test
+    public void testAmazonTranscribeAudio_deDE() throws Exception {
+        assertTranscript("de-DE_(We_Are_At_School_x2).mp3", LanguageCode.DeDE,
+                "Wir sind in der Schule. Wir sind in der Schule.");
+    }
+
+    /** Audio file, no source language passed in; the file is de-DE. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_deDE() throws Exception {
+        assertTranscript("de-DE_(We_Are_At_School_x2).mp3", null,
+                "Wir sind in der Schule. Wir sind in der Schule.");
+    }
+
+    /** Audio file, source language given: it-IT (Italian). */
+    @Test
+    public void testAmazonTranscribeAudio_itIT() throws Exception {
+        assertTranscript("it-IT_(We_Are_Having_Class_x2).mp3", LanguageCode.ItIT,
+                "stiamo facendo lezione. stiamo facendo lezione.");
+    }
+
+    /** Audio file, no source language passed in; the file is it-IT. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_itIT() throws Exception {
+        assertTranscript("it-IT_(We_Are_Having_Class_x2).mp3", null,
+                "stiamo facendo lezione. stiamo facendo lezione.");
+    }
+
+    /** Audio file, source language given: ja-JP (Japanese). */
+    @Test
+    public void testAmazonTranscribeAudio_jaJP() throws Exception {
+        //TODO or Watashitachi wa gakkō ni imasu
+        assertTranscript("ja-JP_(We_Are_At_School).mp3", LanguageCode.JaJP, "私達は学校にいます");
+    }
+
+    /** Audio file, no source language passed in; the file is ja-JP. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_jaJP() throws Exception {
+        //TODO or Watashitachi wa gakkō ni imasu
+        assertTranscript("ja-JP_(We_Are_At_School).mp3", null, "私達は学校にいます");
+    }
+
+    /** Audio file, source language given: ko-KR (Korean). */
+    @Test
+    public void testAmazonTranscribeAudio_koKR() throws Exception {
+        //TODO or ulineun sueob-eulhagoissda
+        assertTranscript("ko-KR_(We_Are_Having_Class_x2).mp3", LanguageCode.KoKR,
+                "우리는 수업을하고있다");
+    }
+
+    /** Audio file, no source language passed in; the file is ko-KR. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_koKR() throws Exception {
+        //TODO or ulineun sueob-eulhagoissda
+        assertTranscript("ko-KR_(We_Are_Having_Class_x2).mp3", null, "우리는 수업을하고있다");
+    }
+
+    /** Video file, source language given: ko-KR (Korean). */
+    @Test
+    public void testAmazonTranscribeVideo_koKR() throws Exception {
+        //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
+        assertTranscript("ko-KR_(Annyeonghaseyo).mp4", LanguageCode.KoKR, "Annyeonghaseyo");
+    }
+
+    /** Video file, no source language passed in; the file is ko-KR. */
+    @Test
+    public void testAmazonTranscribeUnknownVideo_koKR() throws Exception {
+        //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
+        assertTranscript("ko-KR_(Annyeonghaseyo).mp4", null, "Annyeonghaseyo");
+    }
+
+    /** Audio file, source language given: pt-BR (Portuguese - Brazil). */
+    @Test
+    public void testAmazonTranscribeAudio_ptBR() throws Exception {
+        assertTranscript("pt-BR_(We_Are_At_School).mp3", LanguageCode.PtBR,
+                "nós estamos na escola.");
+    }
+
+    /** Audio file, no source language passed in; the file is pt-BR. */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_ptBR() throws Exception {
+        assertTranscript("pt-BR_(We_Are_At_School).mp3", null, "nós estamos na escola.");
+    }
+
+}
diff --git a/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ShortAudioSampleFrench.mp3
similarity index 100%
rename from tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ShortAudioSampleFrench.mp3
diff --git a/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/de-DE_(We_Are_At_School_x2).mp3
similarity index 100%
rename from tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/de-DE_(We_Are_At_School_x2).mp3
diff --git
a/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-AU_(A_Little_Bottle_Of_Water).mp3
similarity index 100%
rename from
tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-AU_(A_Little_Bottle_Of_Water).mp3
diff --git
a/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-GB_(A_Little_Bottle_Of_Water).mp3
similarity index 100%
rename from
tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-GB_(A_Little_Bottle_Of_Water).mp3
diff --git
a/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(A_Little_Bottle_Of_Water).mp3
similarity index 100%
rename from
tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(A_Little_Bottle_Of_Water).mp3
diff --git a/tika-transcribe/src/test/resources/en-US_(Hi).mp4
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(Hi).mp4
similarity index 100%
rename from tika-transcribe/src/test/resources/en-US_(Hi).mp4
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(Hi).mp4
diff --git
a/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/it-IT_(We_Are_Having_Class_x2).mp3
similarity index 100%
rename from
tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/it-IT_(We_Are_Having_Class_x2).mp3
diff --git a/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ja-JP_(We_Are_At_School).mp3
similarity index 100%
rename from tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ja-JP_(We_Are_At_School).mp3
diff --git a/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(Annyeonghaseyo).mp4
similarity index 100%
rename from tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(Annyeonghaseyo).mp4
diff --git
a/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(We_Are_Having_Class_x2).mp3
similarity index 100%
rename from
tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(We_Are_Having_Class_x2).mp3
diff --git a/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/pt-BR_(We_Are_At_School).mp3
similarity index 100%
rename from tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3
rename to
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/pt-BR_(We_Are_At_School).mp3
diff --git
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-transcribe-aws.xml
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-transcribe-aws.xml
new file mode 100644
index 0000000..875fe5b
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-transcribe-aws.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe">
+ <params>
+ <!-- first three are required -->
+ <param name="bucket" type="string">bucket</param>
+ <param name="clientId" type="string">clientId</param>
+ <param name="clientSecret" type="string">clientSecret</param>
+ <!-- region is optional -->
+ <param name="region" type="string">region</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
deleted file mode 100644
index 5b50491..0000000
---
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.transcribe;
-
-import com.amazonaws.AmazonServiceException;
-import com.amazonaws.SdkClientException;
-import com.amazonaws.auth.AWSStaticCredentialsProvider;
-import com.amazonaws.auth.BasicAWSCredentials;
-import com.amazonaws.services.s3.AmazonS3;
-import com.amazonaws.services.s3.AmazonS3ClientBuilder;
-import com.amazonaws.services.s3.model.AmazonS3Exception;
-import com.amazonaws.services.s3.model.CompressionType;
-import com.amazonaws.services.s3.model.ExpressionType;
-import com.amazonaws.services.s3.model.InputSerialization;
-import com.amazonaws.services.s3.model.JSONInput;
-import com.amazonaws.services.s3.model.JSONOutput;
-import com.amazonaws.services.s3.model.JSONType;
-import com.amazonaws.services.s3.model.OutputSerialization;
-import com.amazonaws.services.s3.model.PutObjectRequest;
-import com.amazonaws.services.s3.model.PutObjectResult;
-import com.amazonaws.services.s3.model.SelectObjectContentEvent;
-import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor;
-import com.amazonaws.services.s3.model.SelectObjectContentRequest;
-import com.amazonaws.services.s3.model.SelectObjectContentResult;
-import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
-import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
-import com.amazonaws.services.transcribe.model.Media;
-import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
-import com.amazonaws.services.transcribe.model.TranscriptionJob;
-import com.amazonaws.services.transcribe.model.TranscriptionJobStatus;
-import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
-import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
-import com.amazonaws.services.transcribe.model.LanguageCode;
-import org.apache.tika.exception.TikaException;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Properties;
-import java.util.UUID;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.stream.Collectors;
-
-/**
- * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a>
- * {@link Transcriber} implementation. See Javadoc for configiration options.
- *
- * @since Tika 2.1
- */
-public class AmazonTranscribe implements Transcriber {
-
- public static final String PROPERTIES_FILE =
"transcribe.amazon.properties";
- public static final String ID_PROPERTY = "transcribe.AWS_ACCESS_KEY";
- public static final String SECRET_PROPERTY = "transcribe.AWS_SECRET_KEY";
- public static final String DEFAULT_ID = "dummy-id";
- public static final String DEFAULT_SECRET = "dummy-secret";
- public static final String DEFAULT_BUCKET = "dummy-bucket";
- public static final String BUCKET_NAME = "transcribe.BUCKET_NAME";
- public static final String REGION = "transcribe.REGION";
- private static final Logger LOG = LoggerFactory
- .getLogger(AmazonTranscribe.class);
- private AmazonTranscribeAsync amazonTranscribeAsync;
- private AmazonS3 amazonS3;
- private String bucketName;
- private String region;
- private boolean isAvailable; // Flag for whether or not transcription is
- // available.
- private String clientId;
- private String clientSecret; // Keys used for the API calls.
- private AWSStaticCredentialsProvider credsProvider;
-
- /**
- * Create a new AmazonTranscribe instance with the client keys specified in
- * <code>transcribe.amazon.properties</code> which needs to be available on
- * the Java Classpath.
- * Silently becomes unavailable when client keys are unavailable.
- * <code>transcribe.AWS_ACCESS_KEY</code>,
- * <code>transcribe.AWS_SECRET_KEY</code>,
- * <code>transcribe.BUCKET_NAME</code> and
- * <code>transcribe.REGION</code> must be set in
- * <code>transcribe.amazon.properties</code>.
- * <b>N.B.</b> it is not necessary to create the bucket before hand.
- * This implementation will automatically create the bucket if one
- * does not alrerady exist, per the name defined above.
- *
- * @since Tika 2.0
- */
- public AmazonTranscribe() {
- Properties config = new Properties();
- try {
- config.load(AmazonTranscribe.class
- .getResourceAsStream(PROPERTIES_FILE));
- this.clientId = config.getProperty(ID_PROPERTY);
- this.clientSecret = config.getProperty(SECRET_PROPERTY);
- this.bucketName = config.getProperty(BUCKET_NAME);
- this.region = config.getProperty(REGION);
- BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId,
- this.clientSecret);
- this.credsProvider = new AWSStaticCredentialsProvider(creds);
- amazonS3 = AmazonS3ClientBuilder.standard()
- .withCredentials(credsProvider).withRegion(this.region)
- .build();
- this.isAvailable = checkAvailable();
- if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
- try {
- amazonS3.createBucket(this.bucketName);
- } catch (AmazonS3Exception e) {
- throw new RuntimeException(e.getErrorMessage());
- }
- }
- this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder
- .standard().withCredentials(credsProvider)
- .withRegion(this.region).build();
- } catch (Exception e) {
- LOG.warn("Exception reading config file", e);
- isAvailable = false;
- }
- }
-
- /**
- * private method to get a unique job key.
- *
- * @return unique job key.
- */
- private String getJobKey() {
- return UUID.randomUUID().toString();
- }
-
- /**
- * Constructs a new {@link PutObjectRequest} object to upload a file to the
- * specified bucket and jobName. After constructing the request, users may
- * optionally specify object metadata or a canned ACL as well.
- *
- * @param inputStream, null
- * The file to upload to Amazon S3.
- * @param jobName
- * The unique job name for each job(UUID).
- */
- private void uploadFileToBucket(InputStream inputStream, String jobName)
- throws TikaException {
- PutObjectRequest request = new PutObjectRequest(this.bucketName,
- jobName, inputStream, null);
- try {
- @SuppressWarnings("unused")
- PutObjectResult response = amazonS3.putObject(request);
- } catch (SdkClientException e) {
- throw (new TikaException("File Upload to AWS Failed"));
- }
- }
-
- /**
- * Starts AWS Transcribe Job without language specification.
- *
- * @param inputStream
- * the source input stream.
- * @return The transcribed string result, NULL if the job failed.
- * @throws TikaException
- * When there is an error transcribing.
- * @throws IOException
- * If an I/O exception of some sort has occurred.
- */
- @Override
- public String transcribe(InputStream inputStream)
- throws TikaException, IOException {
- if (!isAvailable())
- return null;
- String jobName = getJobKey();
- uploadFileToBucket(inputStream, jobName);
- StartTranscriptionJobRequest startTranscriptionJobRequest = new
StartTranscriptionJobRequest();
- Media media = new Media();
- media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
-
startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media)
- .withOutputBucketName(this.bucketName)
- .withTranscriptionJobName(jobName)
- .setRequestCredentialsProvider(credsProvider);
- amazonTranscribeAsync
- .startTranscriptionJob(startTranscriptionJobRequest);
- return getTranscriptText(jobName);
- }
-
- /**
- * Starts AWS Transcribe Job with language specification.
- *
- * @param inputStream
- * the source input stream.
- * @param sourceLanguage
- * <a href=
- *
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
- * Language Code</a> for the language used in the input media
- * file.
- * @return The transcribed string result, NULL if the job failed.
- * @throws TikaException
- * When there is an error transcribing.
- * @throws IOException
- * If an I/O exception of some sort has occurred.
- * @see <a href=
- *
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
- * Language Code</a>
- */
- @Override
- public String transcribe(InputStream inputStream, String sourceLanguage)
- throws TikaException, IOException {
- if (!isAvailable())
- return null;
- String jobName = getJobKey();
- uploadFileToBucket(inputStream, jobName);
- StartTranscriptionJobRequest startTranscriptionJobRequest = new
StartTranscriptionJobRequest();
- Media media = new Media();
- media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
- ((StartTranscriptionJobRequest) startTranscriptionJobRequest
- .withMedia(media).withOutputBucketName(this.bucketName)
- .withTranscriptionJobName(jobName)
- .withRequestCredentialsProvider(credsProvider))
- .withLanguageCode(
- LanguageCode.fromValue(sourceLanguage));
- amazonTranscribeAsync
- .startTranscriptionJob(startTranscriptionJobRequest);
- return getTranscriptText(jobName);
- }
-
- /**
- * @return true if this Transcriber is probably able to transcribe right
- * now.
- * @since Tika 2.1
- */
- @Override
- public boolean isAvailable() {
- return this.isAvailable;
- }
-
- /**
- * Sets the client Id for the transcriber API.
- *
- * @param id
- * The ID to set.
- */
- public void setId(String id) {
- this.clientId = id;
- this.isAvailable = checkAvailable();
- }
-
- /**
- * Sets the client secret for the transcriber API.
- *
- * @param secret
- * The secret to set.
- */
- public void setSecret(String secret) {
- this.clientSecret = secret;
- this.isAvailable = checkAvailable();
- }
-
- /**
- * Sets the client secret for the transcriber API.
- *
- * @param bucket
- * The bucket to set.
- */
- public void setBucket(String bucket) {
- this.bucketName = bucket;
- this.isAvailable = checkAvailable();
- }
-
- /**
- * Private method check if the service is available.
- *
- * @return if the service is available
- */
- private boolean checkAvailable() {
- return clientId != null && !clientId.equals(DEFAULT_ID)
- && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET)
- && bucketName != null && !bucketName.equals(DEFAULT_BUCKET);
- }
-
- /**
- * Gets Transcription result from AWS S3 bucket given the jobName.
- *
- * @param fileNameS3
- * The path of the file to upload to Amazon S3.
- * @return The transcribed string result, NULL if the job failed.
- * @throws IOException possible reasons include (i) an End Event is not
received
- * from AWS S3 SelectObjectContentResult operation and (ii) a parse
exception
- * whilst processing JSON from the AWS S3 SelectObjectContentResult
operation.
- * @throws SdkClientException a AWS-specific exception related to
SelectObjectContentResult
- * operation.
- * @throws AmazonServiceException possibly thrown if there is an issue
selecting object content
- * from AWS S3 objects.
- */
- private String getTranscriptText(String fileNameS3) throws
AmazonServiceException, SdkClientException, IOException {
- TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(
- fileNameS3);
- String text = null;
- if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name()
- .equals(transcriptionJob.getTranscriptionJobStatus())) {
- InputSerialization inputSerialization = new
InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT))
- .withCompressionType(CompressionType.NONE);
- OutputSerialization outputSerialization = new
OutputSerialization().withJson(new JSONOutput());
- SelectObjectContentRequest request = new
SelectObjectContentRequest()
- .withBucketName(this.bucketName).withKey(fileNameS3 +
".json")
- .withExpression("Select
s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT
MISSING
-
.withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider);
- request.setInputSerialization(inputSerialization);
- request.setOutputSerialization(outputSerialization);
-
- final AtomicBoolean isResultComplete = new AtomicBoolean(false);
-
- try (SelectObjectContentResult result = amazonS3
- .selectObjectContent(request)) {
- InputStream resultInputStream = result.getPayload()
- .getRecordsInputStream(
- new SelectObjectContentEventVisitor() {
- @Override
- public void visit(
-
SelectObjectContentEvent.StatsEvent event) {
- LOG.debug(
- "Received Stats, Bytes
Scanned: "
- + event.getDetails()
- .getBytesScanned()
- + " Bytes Processed: "
- + event.getDetails()
- .getBytesProcessed());
- }
-
- /*
- * An End Event informs that the request
has
- * finished successfully.
- */
- @Override
- public void visit(
- SelectObjectContentEvent.EndEvent
event) {
- isResultComplete.set(true);
- LOG.debug(
- "Received End Event. Result is
complete.");
- }
- });
- text = new BufferedReader(
- new InputStreamReader(resultInputStream,
StandardCharsets.UTF_8))
- .lines()
- .collect(Collectors.joining("\n"));
- }
- /*
- * The End Event indicates all matching records have been
- * transmitted. If the End Event is not received, the results
- * may be incomplete.
- */
- if (!isResultComplete.get()) {
- throw new IOException(
- "S3 Select request was incomplete as End Event was not
received.");
- }
- }
- JSONParser parser = new JSONParser();
- JSONObject obj = null;
- try {
- obj = (JSONObject) parser.parse(text);
- } catch (ParseException e) {
- throw new IOException(e.getMessage(), e);
- }
- return obj.get("transcript").toString();
- }
-
- /**
- * Private helper function to get object from s3.
- *
- * @param jobName
- * The unique job name for each job(UUID).
- * @return TranscriptionJob object
- */
- private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) {
- GetTranscriptionJobRequest getTranscriptionJobRequest = new
GetTranscriptionJobRequest();
- getTranscriptionJobRequest
- .withRequestCredentialsProvider(credsProvider);
- getTranscriptionJobRequest.setTranscriptionJobName(jobName);
- while (true) {
- GetTranscriptionJobResult innerResult = amazonTranscribeAsync
- .getTranscriptionJob(getTranscriptionJobRequest);
- String status = innerResult.getTranscriptionJob()
- .getTranscriptionJobStatus();
- if (TranscriptionJobStatus.COMPLETED.name().equals(status)
- || TranscriptionJobStatus.FAILED.name().equals(status)) {
- return innerResult.getTranscriptionJob();
- }
- }
- }
-}
\ No newline at end of file
diff --git
a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
deleted file mode 100644
index 1256ab6..0000000
---
a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.tika.language.translate.amazontranscribe
diff --git
a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
deleted file mode 100644
index 043a66f..0000000
---
a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-transcribe.AWS_ACCESS_KEY=dummy_key
-transcribe.AWS_SECRET_KEY=dummy_key
-transcribe.BUCKET_NAME=dummy_name
diff --git
a/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
b/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
deleted file mode 100644
index 3b424f9..0000000
---
a/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
+++ /dev/null
@@ -1,527 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.transcribe;
-
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import java.io.FileInputStream;
-
-import static junit.framework.TestCase.assertNotNull;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
-
-//TODO: Check the ACTUAL output of Amazon Transcribe.
-
-/**
- * Tests tika-trancribe by creating an AmazonTranscribe() object.
- * 1) Tests that transcribe functions properly when it is given just a
filepath.
- * 2) Both audio (mp3) and video (mp4) files are used in these tests.
- */
-@Ignore("Ignore until finalize AmazonTransribe Interface & build Tika")
-public class AmazonTranscribeTest {
- AmazonTranscribe transcriber;
-
- @Before
- public void setUp() {
- transcriber = new AmazonTranscribe();
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is en-US (English - United States)
- */
- @Test
- public void testAmazonTranscribeAudio_enUS() {
- String audioFilePath =
"src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3";
- String expected = "a little bottle of water.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "en-US");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is en-US (English - United States)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_enUS() {
- String audioFilePath =
"src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3";
- String expected = "a little bottle of water.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is en-US (English - United States)
- */
- @Test
- public void testAmazonTranscribeVideo_enUS() {
- String videoFilePath = "en-US_(Hi).mp4";
- String expected = "Hi";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(videoFilePath), "en-US");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with a video file without passing in the source
language.
- * The source language of the file is en-US (English - United States)
- */
- @Test
- public void testAmazonTranscribeUnknownVideo_enUS() {
- String videoFilePath = "en-US_(Hi).mp4";
- String expected = "Hi";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(videoFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is en-GB (English - Great Britain)
- */
- @Test
- public void testAmazonTranscribeAudio_enGB() {
- String audioFilePath =
"src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
- String expected = "a little bottle of water.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "en-GB");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is en-GB (English - Great Britain)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_enGB() {
- String audioFilePath =
"src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
- String expected = "a little bottle of water.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is en-AU (English - Australia)
- */
- @Test
- public void testAmazonTranscribeAudio_enAU() {
- String source =
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
- String expected = "a little bottle of water.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new FileInputStream(source),
"en-AU");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is en-AU (English - Australian)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_enAU() {
- String videoFilePath =
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
- String expected = "a little bottle of water.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(videoFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is de-DE (German)
- */
- @Test
- public void testAmazonTranscribeAudio_deDE() {
- String audioFilePath =
"src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
- String expected = "Wir sind in der Schule. Wir sind in der Schule.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "de-DE");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is de-DE (German)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_deDE() {
- String audioFilePath =
"src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
- String expected = "Wir sind in der Schule. Wir sind in der Schule.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is it-IT (Italian)
- */
- @Test
- public void testAmazonTranscribeAudio_itIT() {
- String audioFilePath =
"src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
- String expected = "stiamo facendo lezione. stiamo facendo lezione.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "it-IT");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is it-IT (Italian)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_itIT() {
- String audioFilePath =
"src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
- String expected = "stiamo facendo lezione. stiamo facendo lezione.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is ja-JP (Japanese)
- */
- @Test
- public void testAmazonTranscribeAudio_jaJP() {
- String audioFilePath =
"src/test/resources/ja-JP_(We_Are_At_School).mp3";
- String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "ja-JP");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is ja-JP (Japanese)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_jaJP() {
- String audioFilePath =
"src/test/resources/ja-JP_(We_Are_At_School).mp3";
- String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is ko-KR (Korean)
- */
- @Test
- public void testAmazonTranscribeAudio_koKR() {
- String audioFilePath =
"src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
- String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "ko-KR");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is ko-KR (Korean)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_koKR() {
- String audioFilePath =
"src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
- String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with a video file given the source language
- * The source language of the file is ko-KR (Korean)
- */
- @Test
- public void testAmazonTranscribeVideo_koKR() {
- String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
- //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
- String expected = "Annyeonghaseyo";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new FileInputStream(source),
"ko-KR");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an video file without passing in the source
language.
- * The source language of the file is ko-KR (Korean)
- */
- @Test
- public void testAmazonTranscribeUnknownVideo_koKR() {
- String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
- //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
- String expected = "Annyeonghaseyo";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new FileInputStream(source));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file given the source language
- * The source language of the file is pt-BR (Portuguese - Brazil)
- */
- @Test
- public void testAmazonTranscribeAudio_ptBR() {
- String audioFilePath =
"src/test/resources/pt-BR_(We_Are_At_School).mp3";
- String expected = "nós estamos na escola.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath), "pt-BR");
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
- /**
- * Tests transcribe with an audio file without passing in the source
language.
- * The source language of the file is pt-BR (Portuguese - Brazil)
- */
- @Test
- public void testAmazonTranscribeUnknownAudio_ptBR() {
- String audioFilePath =
"src/test/resources/pt-BR_(We_Are_At_School).mp3";
- String expected = "nós estamos na escola.";
- String result;
-
- if (transcriber.isAvailable()) {
- try {
- result = transcriber.transcribe(new
FileInputStream(audioFilePath));
- assertNotNull(result);
- assertEquals("Result: [" + result
- + "]: not equal to expected: [" + expected + "]",
- expected, result);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- }
- }
-
-}