This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2e520e8 Revert "TIKA-3384 -- convert transcribe to a traditional
parser"
2e520e8 is described below
commit 2e520e82d7c2d5088803af60cb44793abd852bea
Author: tallison <[email protected]>
AuthorDate: Tue May 18 05:49:11 2021 -0400
Revert "TIKA-3384 -- convert transcribe to a traditional parser"
This reverts commit 93d2211037b01ca237a51f83879ae35f3f76dca8.
---
pom.xml | 1 +
.../org/apache/tika/transcribe/Transcriber.java | 60 +++
tika-example/pom.xml | 8 +-
.../tika/example/TranscribeTranslateExample.java | 71 ++-
tika-parsers/tika-parsers-ml/pom.xml | 1 -
tika-transcribe/pom.xml | 159 +++++++
.../apache/tika/transcribe/AmazonTranscribe.java | 406 ++++++++++++++++
.../org.apache.tika.language.translate.Translator | 16 +
.../transcribe.amazon.properties | 18 +
.../tika/transcribe/AmazonTranscribeTest.java | 527 +++++++++++++++++++++
.../src/test/resources/ShortAudioSampleFrench.mp3 | Bin 0 -> 25861 bytes
.../test/resources/de-DE_(We_Are_At_School_x2).mp3 | Bin 0 -> 38547 bytes
.../resources/en-AU_(A_Little_Bottle_Of_Water).mp3 | Bin 0 -> 33365 bytes
.../resources/en-GB_(A_Little_Bottle_Of_Water).mp3 | Bin 0 -> 35872 bytes
.../resources/en-US_(A_Little_Bottle_Of_Water).mp3 | Bin 0 -> 29603 bytes
tika-transcribe/src/test/resources/en-US_(Hi).mp4 | Bin 0 -> 21739 bytes
.../resources/it-IT_(We_Are_Having_Class_x2).mp3 | Bin 0 -> 42219 bytes
.../test/resources/ja-JP_(We_Are_At_School).mp3 | Bin 0 -> 21699 bytes
.../src/test/resources/ko-KR_(Annyeonghaseyo).mp4 | Bin 0 -> 144151 bytes
.../resources/ko-KR_(We_Are_Having_Class_x2).mp3 | Bin 0 -> 66843 bytes
.../test/resources/pt-BR_(We_Are_At_School).mp3 | Bin 0 -> 29043 bytes
21 files changed, 1220 insertions(+), 47 deletions(-)
diff --git a/pom.xml b/pom.xml
index d0e43d4..f8c6591 100644
--- a/pom.xml
+++ b/pom.xml
@@ -52,6 +52,7 @@
<module>tika-translate</module>
<module>tika-example</module>
<module>tika-java7</module>
+ <module>tika-transcribe</module>
</modules>
<profiles>
diff --git
a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
new file mode 100644
index 0000000..3546256
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.transcribe;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Interface for Transcriber services.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-94">TIKA-94</a>
+ * @since Tika 2.1
+ */
+public interface Transcriber {
+ /**
+ * Transcribe the given file.
+ *
+ * @param inputStream the source input stream.
+ * @return The transcribed string result, NULL if the job failed.
+ * @throws TikaException When there is an error transcribing.
+ * @throws IOException If an I/O exception of some sort has occurred.
+ * @since 2.1
+ */
+ public String transcribe(InputStream inputStream) throws TikaException,
IOException;
+
+ /**
+ * Transcribe the given file using the given source language.
+ *
+ * @param inputStream the source input stream.
+ * @param sourceLanguage The language code for the language used in the
input media file.
+ * @return The transcribed string result, NULL if the job failed.
+ * @throws TikaException When there is an error transcribing.
+ * @throws IOException If an I/O exception of some sort has occurred.
+ * @since 2.1
+ */
+ public String transcribe(InputStream inputStream, String sourceLanguage)
throws TikaException, IOException;
+
+ /**
+ * @return true if this Transcriber is probably able to transcribe right
now.
+ * @since Tika 2.1
+ */
+ public boolean isAvailable();
+}
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index ce6a2b3..f12304e 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -64,13 +64,13 @@
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-eval-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-transcribe-aws</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-transcribe</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
@@ -88,7 +88,7 @@
</exclusions>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
diff --git
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
index f77af72..12dd7e5 100644
---
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
+++
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
@@ -17,14 +17,12 @@
package org.apache.tika.example;
-import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.io.FileInputStream;
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.language.translate.GoogleTranslator;
import org.apache.tika.language.translate.Translator;
-import org.apache.tika.parser.transcribe.aws.AmazonTranscribe;
+import org.apache.tika.transcribe.AmazonTranscribe;
+import org.apache.tika.transcribe.Transcriber;
/**
* This example demonstrates primitive logic for
@@ -32,8 +30,8 @@ import org.apache.tika.parser.transcribe.aws.AmazonTranscribe;
* could be considered as a downstream process to
* transcription.
* We simply pass the output of
- * a call to {@link Tika#parseToString(Path)}
- * into {@link Translator#translate(String, String)}.
+ * a call to {@link Transcriber#transcribe(java.io.InputStream)}
+ * into {@link Translator#translate(String, String)}.
* The {@link GoogleTranslator} is configured with a target
* language of "en-US".
* @author lewismc
@@ -64,53 +62,42 @@ public class TranscribeTranslateExample {
/**
* Use {@link AmazonTranscribe} to execute transcription on input data.
- * This implementation needs to be configured as explained in the Javadoc.
+ * This implementation needs configured as explained in the Javadoc.
* @param file the name of the file (which needs to be on the Java
Classpath) to transcribe.
* @return transcribed text.
*/
- public static String amazonTranscribe(Path tikaConfig, Path file) throws
Exception {
- return new Tika(new TikaConfig(tikaConfig)).parseToString(file);
+ public static String amazonTranscribe(String file) {
+ String filePath =
TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath();
+ String result = null;
+ Transcriber transcriber = new AmazonTranscribe();
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new FileInputStream(filePath));
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ return result;
}
/**
* Main method to run this example. This program can be invoked as follows
* <ol>
- * <li><code>transcribe-translate ${tika-config.xml} ${file}</code>; which
executes both
+ * <li><code>transcribe-translate ${file}</code>; which executes both
* transcription then translation on the given resource, or
- * <li><code>transcribe ${tika-config.xml} ${file}</code>; which executes
only translation</li>
+ * <li><code>transcribe ${file}</code>; which executes only
transcription</li>
* @param args either of the commands described above and the input file
- * (which needs to be on the Java Classpath).
- *
- *
- *
- * ${tika-config.xml} must include credentials for aws and a temporary
storage bucket:
- * <pre>
- * {@code
- * <properties>
- * <parsers>
- * <parser class="org.apache.tika.parser.DefaultParser"/>
- * <parser
class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe">
- * <params>
- * <param name="bucket" type="string">bucket</param>
- * <param name="clientId" type="string">clientId</param>
- * <param name="clientSecret" type="string">clientSecret</param>
- * </params>
- * </parser>
- * </parsers>
- * </properties>
- * }
- * </pre>
+ * (which needs to be on the Java Classpath).
*/
- public static void main (String[] args) throws Exception {
+ public static void main (String[] args) {
String text = null;
- if (args.length > 1) {
- if ("transcribe-translate".equals(args[1])) {
- text =
googleTranslateToEnglish(amazonTranscribe(Paths.get(args[0]),
- Paths.get(args[1])));
- System.out.print("Transcription and translation
successful!\nEXTRACTED TEXT: " + text);
- } else if ("transcribe".equals(args[1])) {
- text = amazonTranscribe(Paths.get(args[0]),
Paths.get(args[1]));
- System.out.print("Transcription successful!\nEXTRACTED TEXT: "
+ text);
+ if (args.length != 0) {
+ if ("transcribe-translate".equals(args[0])) {
+ text = googleTranslateToEnglish(amazonTranscribe(args[1]));
+ System.out.print("Transcription and translation
successful!\nEXTRAXCTED TEXT: " + text);
+ } else if ("transcribe".equals(args[0])) {
+ text = amazonTranscribe(args[1]);
+ System.out.print("Transcription successful!\nEXTRAXCTED TEXT:
" + text);
} else {
System.out.print("Incorrect invocation, see Javadoc.");
}
diff --git a/tika-parsers/tika-parsers-ml/pom.xml
b/tika-parsers/tika-parsers-ml/pom.xml
index 2dcde9e..ba9bd38 100644
--- a/tika-parsers/tika-parsers-ml/pom.xml
+++ b/tika-parsers/tika-parsers-ml/pom.xml
@@ -40,7 +40,6 @@
<module>tika-age-recogniser</module>
<module>tika-parser-advancedmedia-module</module>
<module>tika-dl</module>
- <module>tika-transcribe-aws</module>
</modules>
<build>
diff --git a/tika-transcribe/pom.xml b/tika-transcribe/pom.xml
new file mode 100644
index 0000000..aadb137
--- /dev/null
+++ b/tika-transcribe/pom.xml
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>2.0.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-transcribe</artifactId>
+ <packaging>bundle</packaging>
+ <name>Apache Tika transcribe</name>
+ <url>http://tika.apache.org/</url>
+ <!--TODO use latest aws version or the one defined in the tika-parent-->
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.amazonaws</groupId>
+ <artifactId>aws-java-sdk-transcribe</artifactId>
+ <version>${aws.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.amazonaws</groupId>
+ <artifactId>aws-java-sdk-s3</artifactId>
+ <version>${aws.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.json-simple</groupId>
+ <artifactId>json-simple</artifactId>
+ <version>${json.simple.version}</version>
+ </dependency>
+ <!-- Test dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <version>${maven.bundle.version}</version>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Bundle-DocURL>${project.url}</Bundle-DocURL>
+ <Bundle-Activator>
+ org.apache.tika.parser.internal.Activator
+ </Bundle-Activator>
+ <Import-Package>
+ org.w3c.dom,
+ org.apache.tika.*,
+ *;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <version>${rat.version}</version>
+ <configuration>
+ <excludes>
+
<exclude>src/main/java/org/apache/tika/parser/txt/Charset*.java</exclude>
+ <exclude>src/test/resources/test-documents/**</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+
<Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+
+ <pluginManagement>
+ <plugins>
+ <!-- This plugin's configuration is used to store Eclipse m2e
-->
+ <!-- settings only. It has no influence on the Maven build
itself. -->
+ <plugin>
+ <groupId>org.eclipse.m2e</groupId>
+ <artifactId>lifecycle-mapping</artifactId>
+ <version>1.0.0</version>
+ <configuration>
+ <lifecycleMappingMetadata>
+ <pluginExecutions>
+ <pluginExecution>
+ <pluginExecutionFilter>
+ <groupId>org.apache.felix</groupId>
+
<artifactId>maven-scr-plugin</artifactId>
+ <version>${maven.scr.version}</version>
+ <goals>
+ <goal>scr</goal>
+ </goals>
+ </pluginExecutionFilter>
+ <action>
+ <execute/>
+ </action>
+ </pluginExecution>
+ </pluginExecutions>
+ </lifecycleMappingMetadata>
+ </configuration>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+</project>
\ No newline at end of file
diff --git
a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
new file mode 100644
index 0000000..5b50491
--- /dev/null
+++
b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.transcribe;
+
+import com.amazonaws.AmazonServiceException;
+import com.amazonaws.SdkClientException;
+import com.amazonaws.auth.AWSStaticCredentialsProvider;
+import com.amazonaws.auth.BasicAWSCredentials;
+import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.AmazonS3Exception;
+import com.amazonaws.services.s3.model.CompressionType;
+import com.amazonaws.services.s3.model.ExpressionType;
+import com.amazonaws.services.s3.model.InputSerialization;
+import com.amazonaws.services.s3.model.JSONInput;
+import com.amazonaws.services.s3.model.JSONOutput;
+import com.amazonaws.services.s3.model.JSONType;
+import com.amazonaws.services.s3.model.OutputSerialization;
+import com.amazonaws.services.s3.model.PutObjectRequest;
+import com.amazonaws.services.s3.model.PutObjectResult;
+import com.amazonaws.services.s3.model.SelectObjectContentEvent;
+import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor;
+import com.amazonaws.services.s3.model.SelectObjectContentRequest;
+import com.amazonaws.services.s3.model.SelectObjectContentResult;
+import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
+import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
+import com.amazonaws.services.transcribe.model.Media;
+import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
+import com.amazonaws.services.transcribe.model.TranscriptionJob;
+import com.amazonaws.services.transcribe.model.TranscriptionJobStatus;
+import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
+import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
+import com.amazonaws.services.transcribe.model.LanguageCode;
+import org.apache.tika.exception.TikaException;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Properties;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+
+/**
+ * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a>
+ * {@link Transcriber} implementation. See Javadoc for configuration options.
+ *
+ * @since Tika 2.1
+ */
+public class AmazonTranscribe implements Transcriber {
+
+ public static final String PROPERTIES_FILE =
"transcribe.amazon.properties";
+ public static final String ID_PROPERTY = "transcribe.AWS_ACCESS_KEY";
+ public static final String SECRET_PROPERTY = "transcribe.AWS_SECRET_KEY";
+ public static final String DEFAULT_ID = "dummy-id";
+ public static final String DEFAULT_SECRET = "dummy-secret";
+ public static final String DEFAULT_BUCKET = "dummy-bucket";
+ public static final String BUCKET_NAME = "transcribe.BUCKET_NAME";
+ public static final String REGION = "transcribe.REGION";
+ private static final Logger LOG = LoggerFactory
+ .getLogger(AmazonTranscribe.class);
+ private AmazonTranscribeAsync amazonTranscribeAsync;
+ private AmazonS3 amazonS3;
+ private String bucketName;
+ private String region;
+ private boolean isAvailable; // Flag for whether or not transcription is
+ // available.
+ private String clientId;
+ private String clientSecret; // Keys used for the API calls.
+ private AWSStaticCredentialsProvider credsProvider;
+
+ /**
+ * Create a new AmazonTranscribe instance with the client keys specified in
+ * <code>transcribe.amazon.properties</code> which needs to be available on
+ * the Java Classpath.
+ * Silently becomes unavailable when client keys are unavailable.
+ * <code>transcribe.AWS_ACCESS_KEY</code>,
+ * <code>transcribe.AWS_SECRET_KEY</code>,
+ * <code>transcribe.BUCKET_NAME</code> and
+ * <code>transcribe.REGION</code> must be set in
+ * <code>transcribe.amazon.properties</code>.
+ * <b>N.B.</b> it is not necessary to create the bucket beforehand.
+ * This implementation will automatically create the bucket if one
+ * does not already exist, per the name defined above.
+ *
+ * @since Tika 2.0
+ */
+ public AmazonTranscribe() {
+ Properties config = new Properties();
+ try {
+ config.load(AmazonTranscribe.class
+ .getResourceAsStream(PROPERTIES_FILE));
+ this.clientId = config.getProperty(ID_PROPERTY);
+ this.clientSecret = config.getProperty(SECRET_PROPERTY);
+ this.bucketName = config.getProperty(BUCKET_NAME);
+ this.region = config.getProperty(REGION);
+ BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId,
+ this.clientSecret);
+ this.credsProvider = new AWSStaticCredentialsProvider(creds);
+ amazonS3 = AmazonS3ClientBuilder.standard()
+ .withCredentials(credsProvider).withRegion(this.region)
+ .build();
+ this.isAvailable = checkAvailable();
+ if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+ try {
+ amazonS3.createBucket(this.bucketName);
+ } catch (AmazonS3Exception e) {
+ throw new RuntimeException(e.getErrorMessage());
+ }
+ }
+ this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder
+ .standard().withCredentials(credsProvider)
+ .withRegion(this.region).build();
+ } catch (Exception e) {
+ LOG.warn("Exception reading config file", e);
+ isAvailable = false;
+ }
+ }
+
+ /**
+ * private method to get a unique job key.
+ *
+ * @return unique job key.
+ */
+ private String getJobKey() {
+ return UUID.randomUUID().toString();
+ }
+
+ /**
+ * Constructs a new {@link PutObjectRequest} object to upload a file to the
+ * specified bucket and jobName. After constructing the request, users may
+ * optionally specify object metadata or a canned ACL as well.
+ *
+ * @param inputStream
+ * The file to upload to Amazon S3.
+ * @param jobName
+ * The unique job name for each job(UUID).
+ */
+ private void uploadFileToBucket(InputStream inputStream, String jobName)
+ throws TikaException {
+ PutObjectRequest request = new PutObjectRequest(this.bucketName,
+ jobName, inputStream, null);
+ try {
+ @SuppressWarnings("unused")
+ PutObjectResult response = amazonS3.putObject(request);
+ } catch (SdkClientException e) {
+ throw (new TikaException("File Upload to AWS Failed"));
+ }
+ }
+
+ /**
+ * Starts AWS Transcribe Job without language specification.
+ *
+ * @param inputStream
+ * the source input stream.
+ * @return The transcribed string result, NULL if the job failed.
+ * @throws TikaException
+ * When there is an error transcribing.
+ * @throws IOException
+ * If an I/O exception of some sort has occurred.
+ */
+ @Override
+ public String transcribe(InputStream inputStream)
+ throws TikaException, IOException {
+ if (!isAvailable())
+ return null;
+ String jobName = getJobKey();
+ uploadFileToBucket(inputStream, jobName);
+ StartTranscriptionJobRequest startTranscriptionJobRequest = new
StartTranscriptionJobRequest();
+ Media media = new Media();
+ media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
+
startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media)
+ .withOutputBucketName(this.bucketName)
+ .withTranscriptionJobName(jobName)
+ .setRequestCredentialsProvider(credsProvider);
+ amazonTranscribeAsync
+ .startTranscriptionJob(startTranscriptionJobRequest);
+ return getTranscriptText(jobName);
+ }
+
+ /**
+ * Starts AWS Transcribe Job with language specification.
+ *
+ * @param inputStream
+ * the source input stream.
+ * @param sourceLanguage
+ * <a href=
+ *
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+ * Language Code</a> for the language used in the input media
+ * file.
+ * @return The transcribed string result, NULL if the job failed.
+ * @throws TikaException
+ * When there is an error transcribing.
+ * @throws IOException
+ * If an I/O exception of some sort has occurred.
+ * @see <a href=
+ *
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+ * Language Code</a>
+ */
+ @Override
+ public String transcribe(InputStream inputStream, String sourceLanguage)
+ throws TikaException, IOException {
+ if (!isAvailable())
+ return null;
+ String jobName = getJobKey();
+ uploadFileToBucket(inputStream, jobName);
+ StartTranscriptionJobRequest startTranscriptionJobRequest = new
StartTranscriptionJobRequest();
+ Media media = new Media();
+ media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
+ ((StartTranscriptionJobRequest) startTranscriptionJobRequest
+ .withMedia(media).withOutputBucketName(this.bucketName)
+ .withTranscriptionJobName(jobName)
+ .withRequestCredentialsProvider(credsProvider))
+ .withLanguageCode(
+ LanguageCode.fromValue(sourceLanguage));
+ amazonTranscribeAsync
+ .startTranscriptionJob(startTranscriptionJobRequest);
+ return getTranscriptText(jobName);
+ }
+
+ /**
+ * @return true if this Transcriber is probably able to transcribe right
+ * now.
+ * @since Tika 2.1
+ */
+ @Override
+ public boolean isAvailable() {
+ return this.isAvailable;
+ }
+
+ /**
+ * Sets the client Id for the transcriber API.
+ *
+ * @param id
+ * The ID to set.
+ */
+ public void setId(String id) {
+ this.clientId = id;
+ this.isAvailable = checkAvailable();
+ }
+
+ /**
+ * Sets the client secret for the transcriber API.
+ *
+ * @param secret
+ * The secret to set.
+ */
+ public void setSecret(String secret) {
+ this.clientSecret = secret;
+ this.isAvailable = checkAvailable();
+ }
+
+ /**
+ * Sets the bucket name for the transcriber API.
+ *
+ * @param bucket
+ * The bucket to set.
+ */
+ public void setBucket(String bucket) {
+ this.bucketName = bucket;
+ this.isAvailable = checkAvailable();
+ }
+
+ /**
+ * Private method to check whether the service is available.
+ *
+ * @return if the service is available
+ */
+ private boolean checkAvailable() {
+ return clientId != null && !clientId.equals(DEFAULT_ID)
+ && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET)
+ && bucketName != null && !bucketName.equals(DEFAULT_BUCKET);
+ }
+
+ /**
+ * Gets Transcription result from AWS S3 bucket given the jobName.
+ *
+ * @param fileNameS3
+ * The job name; the transcript is read from the bucket object keyed {@code fileNameS3 + ".json"}.
+ * @return The transcribed string result, NULL if the job failed.
+ * @throws IOException possible reasons include (i) an End Event is not
received
+ * from AWS S3 SelectObjectContentResult operation and (ii) a parse
exception
+ * whilst processing JSON from the AWS S3 SelectObjectContentResult
operation.
+ * @throws SdkClientException a AWS-specific exception related to
SelectObjectContentResult
+ * operation.
+ * @throws AmazonServiceException possibly thrown if there is an issue
selecting object content
+ * from AWS S3 objects.
+ */
+ private String getTranscriptText(String fileNameS3) throws
AmazonServiceException, SdkClientException, IOException {
+ TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(
+ fileNameS3);
+ String text = null;
+ if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name()
+ .equals(transcriptionJob.getTranscriptionJobStatus())) {
+ InputSerialization inputSerialization = new
InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT))
+ .withCompressionType(CompressionType.NONE);
+ OutputSerialization outputSerialization = new
OutputSerialization().withJson(new JSONOutput());
+ SelectObjectContentRequest request = new
SelectObjectContentRequest()
+ .withBucketName(this.bucketName).withKey(fileNameS3 +
".json")
+ .withExpression("Select
s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT
MISSING
+
.withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider);
+ request.setInputSerialization(inputSerialization);
+ request.setOutputSerialization(outputSerialization);
+
+ final AtomicBoolean isResultComplete = new AtomicBoolean(false);
+
+ try (SelectObjectContentResult result = amazonS3
+ .selectObjectContent(request)) {
+ InputStream resultInputStream = result.getPayload()
+ .getRecordsInputStream(
+ new SelectObjectContentEventVisitor() {
+ @Override
+ public void visit(
+
SelectObjectContentEvent.StatsEvent event) {
+ LOG.debug(
+ "Received Stats, Bytes
Scanned: "
+ + event.getDetails()
+ .getBytesScanned()
+ + " Bytes Processed: "
+ + event.getDetails()
+ .getBytesProcessed());
+ }
+
+ /*
+ * An End Event informs that the request
has
+ * finished successfully.
+ */
+ @Override
+ public void visit(
+ SelectObjectContentEvent.EndEvent
event) {
+ isResultComplete.set(true);
+ LOG.debug(
+ "Received End Event. Result is
complete.");
+ }
+ });
+ text = new BufferedReader(
+ new InputStreamReader(resultInputStream,
StandardCharsets.UTF_8))
+ .lines()
+ .collect(Collectors.joining("\n"));
+ }
+ /*
+ * The End Event indicates all matching records have been
+ * transmitted. If the End Event is not received, the results
+ * may be incomplete.
+ */
+ if (!isResultComplete.get()) {
+ throw new IOException(
+ "S3 Select request was incomplete as End Event was not
received.");
+ }
+ }
+ JSONParser parser = new JSONParser();
+ JSONObject obj = null;
+ try {
+ obj = (JSONObject) parser.parse(text);
+ } catch (ParseException e) {
+ throw new IOException(e.getMessage(), e);
+ }
+ return obj.get("transcript").toString();
+ }
+
+ /**
+ * Private helper function to get object from s3.
+ *
+ * @param jobName
+ * The unique job name for each job(UUID).
+ * @return TranscriptionJob object
+ */
+ private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) {
+ GetTranscriptionJobRequest getTranscriptionJobRequest = new
GetTranscriptionJobRequest();
+ getTranscriptionJobRequest
+ .withRequestCredentialsProvider(credsProvider);
+ getTranscriptionJobRequest.setTranscriptionJobName(jobName);
+ while (true) {
+ GetTranscriptionJobResult innerResult = amazonTranscribeAsync
+ .getTranscriptionJob(getTranscriptionJobRequest);
+ String status = innerResult.getTranscriptionJob()
+ .getTranscriptionJobStatus();
+ if (TranscriptionJobStatus.COMPLETED.name().equals(status)
+ || TranscriptionJobStatus.FAILED.name().equals(status)) {
+ return innerResult.getTranscriptionJob();
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git
a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
new file mode 100644
index 0000000..1256ab6
--- /dev/null
+++
b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.language.translate.amazontranscribe
diff --git
a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
new file mode 100644
index 0000000..043a66f
--- /dev/null
+++
b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+transcribe.AWS_ACCESS_KEY=dummy_key
+transcribe.AWS_SECRET_KEY=dummy_key
+transcribe.BUCKET_NAME=dummy_name
diff --git
a/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
b/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
new file mode 100644
index 0000000..3b424f9
--- /dev/null
+++
b/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.transcribe;
+
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.FileInputStream;
+
+import static junit.framework.TestCase.assertNotNull;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+//TODO: Check the ACTUAL output of Amazon Transcribe.
+
+/**
+ * Tests tika-trancribe by creating an AmazonTranscribe() object.
+ * 1) Tests that transcribe functions properly when it is given just a
filepath.
+ * 2) Both audio (mp3) and video (mp4) files are used in these tests.
+ */
+@Ignore("Ignore until finalize AmazonTransribe Interface & build Tika")
+public class AmazonTranscribeTest {
+ AmazonTranscribe transcriber;
+
+ @Before
+ public void setUp() {
+ transcriber = new AmazonTranscribe();
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is en-US (English - United States)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_enUS() {
+ String audioFilePath =
"src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3";
+ String expected = "a little bottle of water.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "en-US");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is en-US (English - United States)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_enUS() {
+ String audioFilePath =
"src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3";
+ String expected = "a little bottle of water.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is en-US (English - United States)
+ */
+ @Test
+ public void testAmazonTranscribeVideo_enUS() {
+ String videoFilePath = "en-US_(Hi).mp4";
+ String expected = "Hi";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(videoFilePath), "en-US");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with a video file without passing in the source
language.
+ * The source language of the file is en-US (English - United States)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownVideo_enUS() {
+ String videoFilePath = "en-US_(Hi).mp4";
+ String expected = "Hi";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(videoFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is en-GB (English - Great Britain)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_enGB() {
+ String audioFilePath =
"src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
+ String expected = "a little bottle of water.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "en-GB");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is en-GB (English - Great Britain)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_enGB() {
+ String audioFilePath =
"src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
+ String expected = "a little bottle of water.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is en-AU (English - Australia)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_enAU() {
+ String source =
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
+ String expected = "a little bottle of water.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new FileInputStream(source),
"en-AU");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is en-AU (English - Australian)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_enAU() {
+ String videoFilePath =
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
+ String expected = "a little bottle of water.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(videoFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is de-DE (German)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_deDE() {
+ String audioFilePath =
"src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
+ String expected = "Wir sind in der Schule. Wir sind in der Schule.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "de-DE");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is de-DE (German)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_deDE() {
+ String audioFilePath =
"src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
+ String expected = "Wir sind in der Schule. Wir sind in der Schule.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is it-IT (Italian)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_itIT() {
+ String audioFilePath =
"src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
+ String expected = "stiamo facendo lezione. stiamo facendo lezione.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "it-IT");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is it-IT (Italian)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_itIT() {
+ String audioFilePath =
"src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
+ String expected = "stiamo facendo lezione. stiamo facendo lezione.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is ja-JP (Japanese)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_jaJP() {
+ String audioFilePath =
"src/test/resources/ja-JP_(We_Are_At_School).mp3";
+ String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "ja-JP");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is ja-JP (Japanese)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_jaJP() {
+ String audioFilePath =
"src/test/resources/ja-JP_(We_Are_At_School).mp3";
+ String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is ko-KR (Korean)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_koKR() {
+ String audioFilePath =
"src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
+ String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "ko-KR");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is ko-KR (Korean)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_koKR() {
+ String audioFilePath =
"src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
+ String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with a video file given the source language
+ * The source language of the file is ko-KR (Korean)
+ */
+ @Test
+ public void testAmazonTranscribeVideo_koKR() {
+ String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
+ //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
+ String expected = "Annyeonghaseyo";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new FileInputStream(source),
"ko-KR");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an video file without passing in the source
language.
+ * The source language of the file is ko-KR (Korean)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownVideo_koKR() {
+ String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
+ //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
+ String expected = "Annyeonghaseyo";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new FileInputStream(source));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file given the source language
+ * The source language of the file is pt-BR (Portuguese - Brazil)
+ */
+ @Test
+ public void testAmazonTranscribeAudio_ptBR() {
+ String audioFilePath =
"src/test/resources/pt-BR_(We_Are_At_School).mp3";
+ String expected = "nós estamos na escola.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath), "pt-BR");
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Tests transcribe with an audio file without passing in the source
language.
+ * The source language of the file is pt-BR (Portuguese - Brazil)
+ */
+ @Test
+ public void testAmazonTranscribeUnknownAudio_ptBR() {
+ String audioFilePath =
"src/test/resources/pt-BR_(We_Are_At_School).mp3";
+ String expected = "nós estamos na escola.";
+ String result;
+
+ if (transcriber.isAvailable()) {
+ try {
+ result = transcriber.transcribe(new
FileInputStream(audioFilePath));
+ assertNotNull(result);
+ assertEquals("Result: [" + result
+ + "]: not equal to expected: [" + expected + "]",
+ expected, result);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+ }
+
+}
diff --git a/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3
b/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3
new file mode 100644
index 0000000..a718047
Binary files /dev/null and
b/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3 differ
diff --git a/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3
b/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3
new file mode 100644
index 0000000..9d4df04
Binary files /dev/null and
b/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3 differ
diff --git
a/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3
b/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3
new file mode 100644
index 0000000..16f840d
Binary files /dev/null and
b/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3 differ
diff --git
a/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3
b/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3
new file mode 100644
index 0000000..2c6ae35
Binary files /dev/null and
b/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3 differ
diff --git
a/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3
b/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3
new file mode 100644
index 0000000..3d69b68
Binary files /dev/null and
b/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3 differ
diff --git a/tika-transcribe/src/test/resources/en-US_(Hi).mp4
b/tika-transcribe/src/test/resources/en-US_(Hi).mp4
new file mode 100644
index 0000000..d697b13
Binary files /dev/null and b/tika-transcribe/src/test/resources/en-US_(Hi).mp4
differ
diff --git
a/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3
b/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3
new file mode 100644
index 0000000..5fa69c3
Binary files /dev/null and
b/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3 differ
diff --git a/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3
b/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3
new file mode 100644
index 0000000..5ddf6e5
Binary files /dev/null and
b/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3 differ
diff --git a/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4
b/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4
new file mode 100644
index 0000000..d757d42
Binary files /dev/null and
b/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4 differ
diff --git
a/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3
b/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3
new file mode 100644
index 0000000..444098c
Binary files /dev/null and
b/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3 differ
diff --git a/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3
b/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3
new file mode 100644
index 0000000..7dfc811
Binary files /dev/null and
b/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3 differ