This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4be6a5008afc02293453b68fc8149e1b7573be5f
Author: tallison <[email protected]>
AuthorDate: Tue May 18 09:25:48 2021 -0400

    TIKA-3384 -- convert transcribe to a traditional parser, step 2; make 
changes
---
 .../org/apache/tika/transcribe/Transcriber.java    |  60 ---
 .../tika/example/TranscribeTranslateExample.java   |  83 ++--
 .../tika-parsers-ml/tika-transcribe-aws/pom.xml    |  44 +-
 .../parser/transcribe/aws/AmazonTranscribe.java    | 460 ++++++++++-----------
 .../transcribe/aws/AmazonTranscribeTest.java       | 447 ++++++--------------
 .../test/resources/tika-config-aws-transcribe.xml  |  29 ++
 .../org.apache.tika.language.translate.Translator  |  16 -
 .../transcribe.amazon.properties                   |  18 -
 8 files changed, 458 insertions(+), 699 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java 
b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
deleted file mode 100644
index 3546256..0000000
--- a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.transcribe;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-
-/**
- * Interface for Transcriber services.
- *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-94">TIKA-94</a>
- * @since Tika 2.1
- */
-public interface Transcriber {
-    /**
-     * Transcribe the given file.
-     *
-     * @param inputStream the source input stream.
-     * @return The transcribed string result, NULL if the job failed.
-     * @throws TikaException When there is an error transcribing.
-     * @throws IOException   If an I/O exception of some sort has occurred.
-     * @since 2.1
-     */
-    public String transcribe(InputStream inputStream) throws TikaException, 
IOException;
-
-    /**
-     * Transcribe the given the file and the source language.
-     *
-     * @param inputStream    the source input stream.
-     * @param sourceLanguage The language code for the language used in the 
input media file.
-     * @return The transcribed string result, NULL if the job failed.
-     * @throws TikaException When there is an error transcribing.
-     * @throws IOException   If an I/O exception of some sort has occurred.
-     * @since 2.1
-     */
-    public String transcribe(InputStream inputStream, String sourceLanguage) 
throws TikaException, IOException;
-
-    /**
-     * @return true if this Transcriber is probably able to transcribe right 
now.
-     * @since Tika 2.1
-     */
-    public boolean isAvailable();
-}
diff --git 
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
 
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
index 12dd7e5..a90d322 100644
--- 
a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
+++ 
b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java
@@ -17,22 +17,23 @@
 
 package org.apache.tika.example;
 
-import java.io.FileInputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.language.translate.GoogleTranslator;
 import org.apache.tika.language.translate.Translator;
-import org.apache.tika.transcribe.AmazonTranscribe;
-import org.apache.tika.transcribe.Transcriber;
 
 /**
  * This example demonstrates primitive logic for
  * chaining Tika API calls. In this case translation
- * could be considered as a downstream process to 
+ * could be considered as a downstream process to
  * transcription.
  * We simply pass the output of
- * a call to {@link Transcriber#transcribe(java.io.InputStream)}
- * into {@link Translator#translate(String, String)}. 
- * The {@link GoogleTranslator} is configured with a target 
+ * a call to {@link Tika#parseToString(Path)}
+ * into {@link Translator#translate(String, String)}.
+ * The {@link GoogleTranslator} is configured with a target
  * language of "en-US".
  * @author lewismc
  *
@@ -42,7 +43,7 @@ public class TranscribeTranslateExample {
     /**
      * Use {@link GoogleTranslator} to execute translation on
      * input data. This implementation needs configured as explained in the 
Javadoc.
-     * In this implementation, Google will try to guess the input language. 
The target 
+     * In this implementation, Google will try to guess the input language. 
The target
      * language is "en-US".
      * @param text input text to translate.
      * @return translated text String.
@@ -61,43 +62,55 @@ public class TranscribeTranslateExample {
     }
 
     /**
-     * Use {@link AmazonTranscribe} to execute transcription on input data.
-     * This implementation needs configured as explained in the Javadoc.
+     * Use {@link org.apache.tika.parser.transcribe.aws.AmazonTranscribe} to 
execute transcription
+     * on input data.
+     * This implementation needs to be configured as explained in the Javadoc.
      * @param file the name of the file (which needs to be on the Java 
Classpath) to transcribe.
      * @return transcribed text.
      */
-    public static String amazonTranscribe(String file) {
-        String filePath = 
TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath();
-        String result = null;
-        Transcriber transcriber = new AmazonTranscribe();
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(filePath));
-            } catch (Exception e) {
-                e.printStackTrace();
-            }
-        }
-        return result;
+    public static String amazonTranscribe(Path tikaConfig, Path file) throws 
Exception {
+        return new Tika(new TikaConfig(tikaConfig)).parseToString(file);
     }
 
     /**
      * Main method to run this example. This program can be invoked as follows
      * <ol>
-     * <li><code>transcribe-translate ${file}</code>; which executes both 
-     * transcription then translation on the given resource, or 
-     * <li><code>transcribe ${file}</code>; which executes only 
translation</li>
-     * @param args either of the commands described above and the input file 
-     * (which needs to be on the Java Classpath). 
+     * <li><code>transcribe-translate ${tika-config.xml} ${file}</code>; which 
executes both
+     * transcription then translation on the given resource, or
+     * <li><code>transcribe ${tika-config.xml} ${file}</code>; which executes 
only transcription</li>
+     * @param args one of the commands described above, followed by the path to
+     * the tika-config.xml file and the path to the input file.
+     *
+     *
+     *
+     * ${tika-config.xml} must include credentials for aws and a temporary 
storage bucket:
+     * <pre>
+     * {@code
+     *  <properties>
+     *   <parsers>
+     *     <parser class="org.apache.tika.parser.DefaultParser"/>
+     *     <parser 
class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe">
+     *       <params>
+     *         <param name="bucket" type="string">bucket</param>
+     *         <param name="clientId" type="string">clientId</param>
+     *         <param name="clientSecret" type="string">clientSecret</param>
+     *       </params>
+     *     </parser>
+     *   </parsers>
+     * </properties>
+     * }
+     * </pre>
      */
-    public static void main (String[] args) {
+    public static void main (String[] args) throws Exception {
         String text = null;
-        if (args.length != 0) {
-            if ("transcribe-translate".equals(args[0])) {
-                text = googleTranslateToEnglish(amazonTranscribe(args[1]));
-                System.out.print("Transcription and translation 
successful!\nEXTRAXCTED TEXT: " + text);
-            } else if ("transcribe".equals(args[0])) {
-                text = amazonTranscribe(args[1]);
-                System.out.print("Transcription successful!\nEXTRAXCTED TEXT: 
" + text);
+        if (args.length > 2) { // expects: <command> <tika-config.xml> <file>
+            if ("transcribe-translate".equals(args[0])) {
+                text = 
googleTranslateToEnglish(amazonTranscribe(Paths.get(args[1]),
+                        Paths.get(args[2])));
+                System.out.print("Transcription and translation 
successful!\nEXTRACTED TEXT: " + text);
+            } else if ("transcribe".equals(args[0])) {
+                text = amazonTranscribe(Paths.get(args[1]), 
Paths.get(args[2]));
+                System.out.print("Transcription successful!\nEXTRACTED TEXT: " 
+ text);
             } else {
                 System.out.print("Incorrect invocation, see Javadoc.");
             }
diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml 
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml
index 2170f8c..1e287c5 100644
--- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml
@@ -25,19 +25,19 @@
     <modelVersion>4.0.0</modelVersion>
 
     <parent>
-        <groupId>org.apache.tika</groupId>
         <artifactId>tika-parsers-ml</artifactId>
+        <groupId>org.apache.tika</groupId>
         <version>2.0.0-SNAPSHOT</version>
     </parent>
 
     <artifactId>tika-transcribe-aws</artifactId>
     <packaging>bundle</packaging>
-    <name>Apache Tika transcribe</name>
+    <name>Apache Tika transcribe aws</name>
     <url>http://tika.apache.org/</url>
     <!--TODO use latest aws version or the one defined in the tika-parent-->
     <dependencies>
         <dependency>
-            <groupId>org.apache.tika</groupId>
+            <groupId>${project.groupId}</groupId>
             <artifactId>tika-core</artifactId>
             <version>${project.version}</version>
         </dependency>
@@ -54,9 +54,37 @@
                     <groupId>commons-codec</groupId>
                     <artifactId>commons-codec</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-databind</artifactId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-logging</groupId>
+            <artifactId>commons-logging</artifactId>
+            <version>${commons.logging.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>${commons.codec.version}</version>
+        </dependency>
+        <dependency>
             <groupId>com.amazonaws</groupId>
             <artifactId>aws-java-sdk-s3</artifactId>
             <version>${aws.version}</version>
@@ -70,6 +98,14 @@
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <scope>test</scope>
+            <type>test-jar</type>
         </dependency>
     </dependencies>
     <build>
@@ -110,7 +146,7 @@
                 <configuration>
                     <archive>
                         <manifestEntries>
-                            
<Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name>
+                            
<Automatic-Module-Name>org.apache.tika.parser.transcribe.aws</Automatic-Module-Name>
                         </manifestEntries>
                     </archive>
                 </configuration>
diff --git 
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
 
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
index 5b50491..91e8452 100644
--- 
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
+++ 
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java
@@ -15,7 +15,21 @@
  * limitations under the License.
  */
 
-package org.apache.tika.transcribe;
+package org.apache.tika.parser.transcribe.aws;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
 
 import com.amazonaws.AmazonServiceException;
 import com.amazonaws.SdkClientException;
@@ -39,48 +53,48 @@ import 
com.amazonaws.services.s3.model.SelectObjectContentRequest;
 import com.amazonaws.services.s3.model.SelectObjectContentResult;
 import com.amazonaws.services.transcribe.AmazonTranscribeAsync;
 import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder;
+import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
+import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
+import com.amazonaws.services.transcribe.model.LanguageCode;
 import com.amazonaws.services.transcribe.model.Media;
 import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest;
 import com.amazonaws.services.transcribe.model.TranscriptionJob;
 import com.amazonaws.services.transcribe.model.TranscriptionJobStatus;
-import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest;
-import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult;
-import com.amazonaws.services.transcribe.model.LanguageCode;
-import org.apache.tika.exception.TikaException;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 import org.json.simple.parser.ParseException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Properties;
-import java.util.UUID;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.stream.Collectors;
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
- * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> 
- * {@link Transcriber} implementation. See Javadoc for configiration options.
+ * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a>
+ * implementation. See Javadoc for configuration options.
+ * <p>
+ * Silently becomes unavailable when client keys are unavailable.
  *
- * @since Tika 2.1
+ * <b>N.B.</b> it is not necessary to create the bucket beforehand.
+ * This implementation will automatically create the bucket if one
+ * does not already exist, per the name defined above.
+ *
+ * @since Tika 2.0
  */
-public class AmazonTranscribe implements Transcriber {
-
-    public static final String PROPERTIES_FILE = 
"transcribe.amazon.properties";
-    public static final String ID_PROPERTY = "transcribe.AWS_ACCESS_KEY";
-    public static final String SECRET_PROPERTY = "transcribe.AWS_SECRET_KEY";
-    public static final String DEFAULT_ID = "dummy-id";
-    public static final String DEFAULT_SECRET = "dummy-secret";
-    public static final String DEFAULT_BUCKET = "dummy-bucket";
-    public static final String BUCKET_NAME = "transcribe.BUCKET_NAME";
-    public static final String REGION = "transcribe.REGION";
-    private static final Logger LOG = LoggerFactory
-            .getLogger(AmazonTranscribe.class);
+
+public class AmazonTranscribe extends AbstractParser implements Initializable {
+    private static final Logger LOG = 
LoggerFactory.getLogger(AmazonTranscribe.class);
     private AmazonTranscribeAsync amazonTranscribeAsync;
     private AmazonS3 amazonS3;
     private String bucketName;
@@ -91,161 +105,74 @@ public class AmazonTranscribe implements Transcriber {
     private String clientSecret; // Keys used for the API calls.
     private AWSStaticCredentialsProvider credsProvider;
 
-    /**
-     * Create a new AmazonTranscribe instance with the client keys specified in
-     * <code>transcribe.amazon.properties</code> which needs to be available on
-     * the Java Classpath.
-     * Silently becomes unavailable when client keys are unavailable.
-     * <code>transcribe.AWS_ACCESS_KEY</code>,
-     * <code>transcribe.AWS_SECRET_KEY</code>,
-     * <code>transcribe.BUCKET_NAME</code> and 
-     * <code>transcribe.REGION</code> must be set in
-     * <code>transcribe.amazon.properties</code>.
-     * <b>N.B.</b> it is not necessary to create the bucket before hand. 
-     * This implementation will automatically create the bucket if one
-     * does not alrerady exist, per the name defined above.
-     *
-     * @since Tika 2.0
-     */
-    public AmazonTranscribe() {
-        Properties config = new Properties();
-        try {
-            config.load(AmazonTranscribe.class
-                    .getResourceAsStream(PROPERTIES_FILE));
-            this.clientId = config.getProperty(ID_PROPERTY);
-            this.clientSecret = config.getProperty(SECRET_PROPERTY);
-            this.bucketName = config.getProperty(BUCKET_NAME);
-            this.region = config.getProperty(REGION);
-            BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId,
-                    this.clientSecret);
-            this.credsProvider = new AWSStaticCredentialsProvider(creds);
-            amazonS3 = AmazonS3ClientBuilder.standard()
-                    .withCredentials(credsProvider).withRegion(this.region)
-                    .build();
-            this.isAvailable = checkAvailable();
-            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
-                try {
-                    amazonS3.createBucket(this.bucketName);
-                } catch (AmazonS3Exception e) {
-                    throw new RuntimeException(e.getErrorMessage());
-                }
-            }
-            this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder
-                    .standard().withCredentials(credsProvider)
-                    .withRegion(this.region).build();
-        } catch (Exception e) {
-            LOG.warn("Exception reading config file", e);
-            isAvailable = false;
-        }
-    }
-
-    /**
-     * private method to get a unique job key.
-     *
-     * @return unique job key.
-     */
-    private String getJobKey() {
-        return UUID.randomUUID().toString();
-    }
+    //https://docs.aws.amazon.com/transcribe/latest/dg/input.html
+    protected static final Set<MediaType> SUPPORTED_TYPES = 
Collections.unmodifiableSet(
+            new HashSet<>(Arrays.asList(MediaType.audio("x-flac"), 
MediaType.audio("mp3"),
+                    MediaType.audio("mpeg"), MediaType.video("ogg"), 
MediaType.audio("vnd.wave"),
+                    MediaType.audio("mp4"), MediaType.video("mp4"), 
MediaType.application("mp4"),
+                    MediaType.video("quicktime"))));
 
-    /**
-     * Constructs a new {@link PutObjectRequest} object to upload a file to the
-     * specified bucket and jobName. After constructing the request, users may
-     * optionally specify object metadata or a canned ACL as well.
-     *
-     * @param inputStream, null
-     *            The file to upload to Amazon S3.
-     * @param jobName
-     *            The unique job name for each job(UUID).
-     */
-    private void uploadFileToBucket(InputStream inputStream, String jobName)
-            throws TikaException {
-        PutObjectRequest request = new PutObjectRequest(this.bucketName,
-                jobName, inputStream, null);
-        try {
-            @SuppressWarnings("unused")
-            PutObjectResult response = amazonS3.putObject(request);
-        } catch (SdkClientException e) {
-            throw (new TikaException("File Upload to AWS Failed"));
-        }
-    }
 
-    /**
-     * Starts AWS Transcribe Job without language specification.
-     *
-     * @param inputStream
-     *            the source input stream.
-     * @return The transcribed string result, NULL if the job failed.
-     * @throws TikaException
-     *             When there is an error transcribing.
-     * @throws IOException
-     *             If an I/O exception of some sort has occurred.
-     */
     @Override
-    public String transcribe(InputStream inputStream)
-            throws TikaException, IOException {
-        if (!isAvailable())
-            return null;
-        String jobName = getJobKey();
-        uploadFileToBucket(inputStream, jobName);
-        StartTranscriptionJobRequest startTranscriptionJobRequest = new 
StartTranscriptionJobRequest();
-        Media media = new Media();
-        media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
-        
startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media)
-        .withOutputBucketName(this.bucketName)
-        .withTranscriptionJobName(jobName)
-        .setRequestCredentialsProvider(credsProvider);
-        amazonTranscribeAsync
-        .startTranscriptionJob(startTranscriptionJobRequest);
-        return getTranscriptText(jobName);
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        if (!isAvailable) { // parser flagged unavailable: advertise no types
+            return Collections.emptySet();
+        }
+        return SUPPORTED_TYPES;
     }
 
     /**
      * Starts AWS Transcribe Job with language specification.
      *
-     * @param inputStream
-     *            the source input stream.
-     * @param sourceLanguage
-     *            <a href=
-     *            
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
-     *            Language Code</a> for the language used in the input media
-     *            file.
-     * @return The transcribed string result, NULL if the job failed.
-     * @throws TikaException
-     *             When there is an error transcribing.
-     * @throws IOException
-     *             If an I/O exception of some sort has occurred.
+     * @param stream   the source input stream.
+     * @param handler  handler to use
+     * @param metadata metadata passed to the wrapping {@link XHTMLContentHandler}
+     * @param context  -- set the {@link LanguageCode} in the ParseContext if 
known
+     * @throws TikaException When there is an error transcribing.
+     * @throws IOException   If an I/O exception of some sort has occurred.
      * @see <a href=
-     *      
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
-     *      Language Code</a>
+     * 
"https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+     * Language Code</a>
      */
     @Override
-    public String transcribe(InputStream inputStream, String sourceLanguage)
-            throws TikaException, IOException {
-        if (!isAvailable())
-            return null;
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
+                      ParseContext context) throws IOException, SAXException, 
TikaException {
+
+        if (!isAvailable) {
+            return;
+        }
         String jobName = getJobKey();
-        uploadFileToBucket(inputStream, jobName);
-        StartTranscriptionJobRequest startTranscriptionJobRequest = new 
StartTranscriptionJobRequest();
+        LanguageCode languageCode = context.get(LanguageCode.class);
+        uploadFileToBucket(stream, jobName);
+        StartTranscriptionJobRequest startTranscriptionJobRequest =
+                new StartTranscriptionJobRequest();
         Media media = new Media();
         media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
-        ((StartTranscriptionJobRequest) startTranscriptionJobRequest
-                .withMedia(media).withOutputBucketName(this.bucketName)
-                .withTranscriptionJobName(jobName)
-                .withRequestCredentialsProvider(credsProvider))
-        .withLanguageCode(
-                LanguageCode.fromValue(sourceLanguage));
-        amazonTranscribeAsync
-        .startTranscriptionJob(startTranscriptionJobRequest);
-        return getTranscriptText(jobName);
+        
startTranscriptionJobRequest.withMedia(media).withOutputBucketName(this.bucketName)
+                
.withTranscriptionJobName(jobName).setRequestCredentialsProvider(credsProvider);
+
+        if (languageCode != null) {
+            startTranscriptionJobRequest.withLanguageCode(languageCode);
+        } else {
+            startTranscriptionJobRequest.withIdentifyLanguage(true);
+        }
+        
amazonTranscribeAsync.startTranscriptionJob(startTranscriptionJobRequest);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        String text = getTranscriptText(jobName);
+        xhtml.startElement("p");
+        xhtml.characters(text);
+        xhtml.endElement("p");
+        xhtml.endDocument();
+
     }
 
+
     /**
      * @return true if this Transcriber is probably able to transcribe right
-     *         now.
+     * now.
      * @since Tika 2.1
      */
-    @Override
     public boolean isAvailable() {
         return this.isAvailable;
     }
@@ -253,10 +180,10 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Sets the client Id for the transcriber API.
      *
-     * @param id
-     *            The ID to set.
+     * @param id The ID to set.
      */
-    public void setId(String id) {
+    @Field
+    public void setClientId(String id) {
         this.clientId = id;
         this.isAvailable = checkAvailable();
     }
@@ -264,10 +191,10 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Sets the client secret for the transcriber API.
      *
-     * @param secret
-     *            The secret to set.
+     * @param secret The secret to set.
      */
-    public void setSecret(String secret) {
+    @Field
+    public void setClientSecret(String secret) {
         this.clientSecret = secret;
         this.isAvailable = checkAvailable();
     }
@@ -275,89 +202,116 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Sets the client secret for the transcriber API.
      *
-     * @param bucket
-     *            The bucket to set.
+     * @param bucket The bucket to set.
      */
+    @Field
     public void setBucket(String bucket) {
         this.bucketName = bucket;
         this.isAvailable = checkAvailable();
     }
 
+    @Field
+    public void setRegion(String region) {
+        this.region = region;
+        this.isAvailable = checkAvailable();
+    }
+
     /**
      * Private method check if the service is available.
      *
      * @return if the service is available
      */
     private boolean checkAvailable() {
-        return clientId != null && !clientId.equals(DEFAULT_ID)
-                && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET)
-                && bucketName != null && !bucketName.equals(DEFAULT_BUCKET);
+        return clientId != null && clientSecret != null && bucketName != null;
+    }
+
+    /**
+     * private method to get a unique job key.
+     *
+     * @return unique job key.
+     */
+    private String getJobKey() {
+        return UUID.randomUUID().toString();
+    }
+
+    /**
+     * Uploads the given stream to the configured bucket by building a
+     * {@link PutObjectRequest} keyed by jobName and sending it to Amazon S3.
+     * Throws a TikaException if the upload raises an SdkClientException.
+     *
+     * @param inputStream  The stream of the file to upload to Amazon S3
+     *                     (object metadata is passed as null).
+     * @param jobName      The unique job name for each job(UUID).
+     */
+    private void uploadFileToBucket(InputStream inputStream, String jobName) 
throws TikaException {
+        PutObjectRequest request =
+                new PutObjectRequest(this.bucketName, jobName, inputStream, 
null);
+        try {
+            @SuppressWarnings("unused") PutObjectResult response = 
amazonS3.putObject(request);
+        } catch (SdkClientException e) {
+            throw (new TikaException("File Upload to AWS Failed"));
+        }
     }
 
     /**
      * Gets Transcription result from AWS S3 bucket given the jobName.
      *
-     * @param fileNameS3
-     *            The path of the file to upload to Amazon S3.
+     * @param fileNameS3 The transcription job name (S3 key without ".json").
      * @return The transcribed string result, NULL if the job failed.
-     * @throws IOException possible reasons include (i) an End Event is not 
received
-     * from AWS S3 SelectObjectContentResult operation and (ii) a parse 
exception
-     * whilst processing JSON from the AWS S3 SelectObjectContentResult 
operation.
-     * @throws SdkClientException a AWS-specific exception related to 
SelectObjectContentResult
-     * operation.
+     * @throws IOException            possible reasons include (i) an End 
Event is not received
+     *                                from AWS S3 SelectObjectContentResult 
operation and (ii) a parse exception
+     *                                whilst processing JSON from the AWS S3 
SelectObjectContentResult operation.
+     * @throws SdkClientException     an AWS-specific exception related to 
SelectObjectContentResult
+     *                                operation.
      * @throws AmazonServiceException possibly thrown if there is an issue 
selecting object content
-     * from AWS S3 objects.
+     *                                from AWS S3 objects.
      */
-    private String getTranscriptText(String fileNameS3) throws 
AmazonServiceException, SdkClientException, IOException {
-        TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(
-                fileNameS3);
+    private String getTranscriptText(String fileNameS3)
+            throws AmazonServiceException, SdkClientException, IOException {
+        TranscriptionJob transcriptionJob = 
retrieveObjectWhenJobCompleted(fileNameS3);
         String text = null;
         if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name()
                 .equals(transcriptionJob.getTranscriptionJobStatus())) {
-            InputSerialization inputSerialization = new 
InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT))
-                    .withCompressionType(CompressionType.NONE);
-            OutputSerialization outputSerialization = new 
OutputSerialization().withJson(new JSONOutput());
-            SelectObjectContentRequest request = new 
SelectObjectContentRequest()
-                    .withBucketName(this.bucketName).withKey(fileNameS3 + 
".json")
-                    .withExpression("Select 
s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT 
MISSING
-                    
.withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider);
+            InputSerialization inputSerialization =
+                    new InputSerialization().withJson(new 
JSONInput().withType(JSONType.DOCUMENT))
+                            .withCompressionType(CompressionType.NONE);
+            OutputSerialization outputSerialization =
+                    new OutputSerialization().withJson(new JSONOutput());
+            SelectObjectContentRequest request =
+                    new 
SelectObjectContentRequest().withBucketName(this.bucketName)
+                            .withKey(fileNameS3 + ".json").withExpression(
+                            "Select s.results.transcripts[0].transcript from 
S3Object s")
+                            //WHERE transcript IS NOT MISSING
+                            .withExpressionType(ExpressionType.SQL)
+                            .withRequestCredentialsProvider(credsProvider);
             request.setInputSerialization(inputSerialization);
             request.setOutputSerialization(outputSerialization);
 
             final AtomicBoolean isResultComplete = new AtomicBoolean(false);
 
-            try (SelectObjectContentResult result = amazonS3
-                    .selectObjectContent(request)) {
+            try (SelectObjectContentResult result = 
amazonS3.selectObjectContent(request)) {
                 InputStream resultInputStream = result.getPayload()
-                        .getRecordsInputStream(
-                                new SelectObjectContentEventVisitor() {
-                                    @Override
-                                    public void visit(
-                                            
SelectObjectContentEvent.StatsEvent event) {
-                                        LOG.debug(
-                                                "Received Stats, Bytes 
Scanned: "
-                                                        + event.getDetails()
-                                                        .getBytesScanned()
-                                                        + " Bytes Processed: "
-                                                        + event.getDetails()
-                                                        .getBytesProcessed());
-                                    }
-
-                                    /*
-                                     * An End Event informs that the request 
has
-                                     * finished successfully.
-                                     */
-                                    @Override
-                                    public void visit(
-                                            SelectObjectContentEvent.EndEvent 
event) {
-                                        isResultComplete.set(true);
-                                        LOG.debug(
-                                                "Received End Event. Result is 
complete.");
-                                    }
-                                });
+                        .getRecordsInputStream(new 
SelectObjectContentEventVisitor() {
+                            @Override
+                            public void 
visit(SelectObjectContentEvent.StatsEvent event) {
+                                LOG.debug("Received Stats, Bytes Scanned: " +
+                                        event.getDetails().getBytesScanned() +
+                                        " Bytes Processed: " +
+                                        
event.getDetails().getBytesProcessed());
+                            }
+
+                            /*
+                             * An End Event informs that the request has
+                             * finished successfully.
+                             */
+                            @Override
+                            public void 
visit(SelectObjectContentEvent.EndEvent event) {
+                                isResultComplete.set(true);
+                                LOG.debug("Received End Event. Result is 
complete.");
+                            }
+                        });
                 text = new BufferedReader(
-                        new InputStreamReader(resultInputStream, 
StandardCharsets.UTF_8))
-                        .lines()
+                        new InputStreamReader(resultInputStream, 
StandardCharsets.UTF_8)).lines()
                         .collect(Collectors.joining("\n"));
             }
             /*
@@ -383,24 +337,62 @@ public class AmazonTranscribe implements Transcriber {
     /**
      * Private helper function to get object from s3.
      *
-     * @param jobName
-     *            The unique job name for each job(UUID).
+     * @param jobName The unique job name for each job(UUID).
      * @return TranscriptionJob object
      */
     private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) {
         GetTranscriptionJobRequest getTranscriptionJobRequest = new 
GetTranscriptionJobRequest();
-        getTranscriptionJobRequest
-        .withRequestCredentialsProvider(credsProvider);
+        
getTranscriptionJobRequest.withRequestCredentialsProvider(credsProvider);
         getTranscriptionJobRequest.setTranscriptionJobName(jobName);
         while (true) {
-            GetTranscriptionJobResult innerResult = amazonTranscribeAsync
-                    .getTranscriptionJob(getTranscriptionJobRequest);
-            String status = innerResult.getTranscriptionJob()
-                    .getTranscriptionJobStatus();
-            if (TranscriptionJobStatus.COMPLETED.name().equals(status)
-                    || TranscriptionJobStatus.FAILED.name().equals(status)) {
+            GetTranscriptionJobResult innerResult =
+                    
amazonTranscribeAsync.getTranscriptionJob(getTranscriptionJobRequest);
+            String status = 
innerResult.getTranscriptionJob().getTranscriptionJobStatus();
+            if (TranscriptionJobStatus.COMPLETED.name().equals(status) ||
+                    TranscriptionJobStatus.FAILED.name().equals(status)) {
                 return innerResult.getTranscriptionJob();
             }
         }
     }
-}
\ No newline at end of file
+
+    @Override
+    public void initialize(Map<String, Param> params) throws 
TikaConfigException {
+        if (!checkAvailable()) {
+            return;
+        }
+
+        try {
+            BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId, 
this.clientSecret);
+            this.credsProvider = new AWSStaticCredentialsProvider(creds);
+            if (region != null) {
+                this.amazonS3 = 
AmazonS3ClientBuilder.standard().withCredentials(credsProvider)
+                        .withRegion(this.region).build();
+            } else {
+                this.amazonS3 =
+                        
AmazonS3ClientBuilder.standard().withCredentials(credsProvider).build();
+
+            }
+            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+                try {
+                    amazonS3.createBucket(this.bucketName);
+                } catch (AmazonS3Exception e) {
+                    throw new TikaConfigException("couldn't create bucket", e);
+                }
+            }
+            this.amazonTranscribeAsync =
+                    
AmazonTranscribeAsyncClientBuilder.standard().withCredentials(credsProvider)
+                            .withRegion(this.region).build();
+        } catch (Exception e) {
+            LOG.warn("Exception reading config file", e);
+            isAvailable = false;
+        }
+
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        //TODO alert user if they've gotten 1 or 2 out of three?
+        this.isAvailable = checkAvailable();
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
 
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
index 3b424f9..be4f76a 100644
--- 
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
+++ 
b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java
@@ -14,17 +14,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.transcribe;
+package org.apache.tika.parser.transcribe.aws;
 
-import org.junit.Before;
+import java.io.InputStream;
+
+import com.amazonaws.services.transcribe.model.LanguageCode;
+import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 
-import java.io.FileInputStream;
-
-import static junit.framework.TestCase.assertNotNull;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 
 //TODO: Check the ACTUAL output of Amazon Transcribe.
 
@@ -33,13 +35,17 @@ import static org.junit.Assert.fail;
  * 1) Tests that transcribe functions properly when it is given just a 
filepath.
  * 2) Both audio (mp3) and video (mp4) files are used in these tests.
  */
-@Ignore("Ignore until finalize AmazonTransribe Interface & build Tika")
-public class AmazonTranscribeTest {
-    AmazonTranscribe transcriber;
+@Ignore("Ignore until finalize AmazonTrancsribe Interface & build Tika")
+public class AmazonTranscribeTest extends TikaTest {
+
+    static Parser PARSER;
 
-    @Before
-    public void setUp() {
-        transcriber = new AmazonTranscribe();
+    @BeforeClass
+    public static void setUp() throws Exception {
+        try (InputStream is = AmazonTranscribeTest.class
+                .getResourceAsStream("tika-config-aws-transcribe.xml")) {
+            PARSER = new TikaConfig(is).getParser();
+        }
     }
 
     /**
@@ -47,23 +53,12 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-US (English - United States)
      */
     @Test
-    public void testAmazonTranscribeAudio_enUS() {
-        String audioFilePath = 
"src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3";
+    public void testAmazonTranscribeAudio_enUS() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.EnUS);
+        String xml = getXML("en-US_(A_Little_Bottle_Of_Water).mp3", PARSER, 
context).xml;
         String expected = "a little bottle of water.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath), "en-US");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        assertContains(expected, xml);
     }
 
     /**
@@ -71,23 +66,10 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-US (English - United States)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_enUS() {
-        String audioFilePath = 
"src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3";
+    public void testAmazonTranscribeUnknownAudio_enUS() throws Exception {
+        String xml = getXML("en-US_(A_Little_Bottle_Of_Water).mp3", 
PARSER).xml;
         String expected = "a little bottle of water.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        assertContains(expected, xml);
     }
 
     /**
@@ -95,23 +77,12 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-US (English - United States)
      */
     @Test
-    public void testAmazonTranscribeVideo_enUS() {
-        String videoFilePath = "en-US_(Hi).mp4";
+    public void testAmazonTranscribeVideo_enUS() throws Exception {
         String expected = "Hi";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(videoFilePath), "en-US");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.EnUS);
+        String xml = getXML("en-US_(Hi).mp4", PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -119,23 +90,10 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-US (English - United States)
      */
     @Test
-    public void testAmazonTranscribeUnknownVideo_enUS() {
-        String videoFilePath = "en-US_(Hi).mp4";
+    public void testAmazonTranscribeUnknownVideo_enUS() throws Exception {
         String expected = "Hi";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(videoFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML("en-US_(Hi).mp4", PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -143,23 +101,13 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-GB (English - Great Britain)
      */
     @Test
-    public void testAmazonTranscribeAudio_enGB() {
-        String audioFilePath = 
"src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
+    public void testAmazonTranscribeAudio_enGB() throws Exception {
+        String file = "en-GB_(A_Little_Bottle_Of_Water).mp3";
         String expected = "a little bottle of water.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath), "en-GB");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.EnGB);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -167,23 +115,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-GB (English - Great Britain)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_enGB() {
-        String audioFilePath = 
"src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
+    public void testAmazonTranscribeUnknownAudio_enGB() throws Exception {
+        String file = "en-GB_(A_Little_Bottle_Of_Water).mp3";
         String expected = "a little bottle of water.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -191,23 +127,13 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-AU (English - Australia)
      */
     @Test
-    public void testAmazonTranscribeAudio_enAU() {
-        String source = 
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
+    public void testAmazonTranscribeAudio_enAU() throws Exception {
+        String file = "en-AU_(A_Little_Bottle_Of_Water).mp3";
         String expected = "a little bottle of water.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(source), 
"en-AU");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.EnAU);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -215,23 +141,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is en-AU (English - Australian)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_enAU() {
-        String videoFilePath = 
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
+    public void testAmazonTranscribeUnknownAudio_enAU() throws Exception {
+        String file = "en-AU_(A_Little_Bottle_Of_Water).mp3";
         String expected = "a little bottle of water.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(videoFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -239,23 +153,13 @@ public class AmazonTranscribeTest {
      * The source language of the file is de-DE (German)
      */
     @Test
-    public void testAmazonTranscribeAudio_deDE() {
-        String audioFilePath = 
"src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
+    public void testAmazonTranscribeAudio_deDE() throws Exception {
+        String file = "de-DE_(We_Are_At_School_x2).mp3";
         String expected = "Wir sind in der Schule. Wir sind in der Schule.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath), "de-DE");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.DeDE);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -263,23 +167,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is de-DE (German)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_deDE() {
-        String audioFilePath = 
"src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
+    public void testAmazonTranscribeUnknownAudio_deDE() throws Exception {
+        String file = "de-DE_(We_Are_At_School_x2).mp3";
         String expected = "Wir sind in der Schule. Wir sind in der Schule.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -287,23 +179,13 @@ public class AmazonTranscribeTest {
      * The source language of the file is it-IT (Italian)
      */
     @Test
-    public void testAmazonTranscribeAudio_itIT() {
-        String audioFilePath = 
"src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
+    public void testAmazonTranscribeAudio_itIT() throws Exception {
+        String file = "it-IT_(We_Are_Having_Class_x2).mp3";
         String expected = "stiamo facendo lezione. stiamo facendo lezione.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath), "it-IT");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.ItIT);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -311,23 +193,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is it-IT (Italian)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_itIT() {
-        String audioFilePath = 
"src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
+    public void testAmazonTranscribeUnknownAudio_itIT() throws Exception {
+        String file = "it-IT_(We_Are_Having_Class_x2).mp3";
         String expected = "stiamo facendo lezione. stiamo facendo lezione.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -335,23 +205,14 @@ public class AmazonTranscribeTest {
      * The source language of the file is ja-JP (Japanese)
      */
     @Test
-    public void testAmazonTranscribeAudio_jaJP() {
-        String audioFilePath = 
"src/test/resources/ja-JP_(We_Are_At_School).mp3";
+    public void testAmazonTranscribeAudio_jaJP() throws Exception {
+        String file = "ja-JP_(We_Are_At_School).mp3";
         String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
-        String result;
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.JaJP);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
 
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath), "ja-JP");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
     }
 
     /**
@@ -359,23 +220,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is ja-JP (Japanese)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_jaJP() {
-        String audioFilePath = 
"src/test/resources/ja-JP_(We_Are_At_School).mp3";
+    public void testAmazonTranscribeUnknownAudio_jaJP() throws Exception {
+        String file = "ja-JP_(We_Are_At_School).mp3";
         String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -383,23 +232,13 @@ public class AmazonTranscribeTest {
      * The source language of the file is ko-KR (Korean)
      */
     @Test
-    public void testAmazonTranscribeAudio_koKR() {
-        String audioFilePath = 
"src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
+    public void testAmazonTranscribeAudio_koKR() throws Exception {
+        String file = "ko-KR_(We_Are_Having_Class_x2).mp3";
         String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new 
FileInputStream(audioFilePath), "ko-KR");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.KoKR);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -407,23 +246,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is ko-KR (Korean)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_koKR() {
-        String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
+    public void testAmazonTranscribeUnknownAudio_koKR() throws Exception {
+        String file = "ko-KR_(We_Are_Having_Class_x2).mp3";
         String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -431,24 +258,14 @@ public class AmazonTranscribeTest {
      * The source language of the file is ko-KR (Korean)
      */
     @Test
-    public void testAmazonTranscribeVideo_koKR() {
-        String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
+    public void testAmazonTranscribeVideo_koKR() throws Exception {
+        String file = "ko-KR_(Annyeonghaseyo).mp4";
         //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
         String expected = "Annyeonghaseyo";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(source), "ko-KR");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.KoKR);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -456,24 +273,12 @@ public class AmazonTranscribeTest {
      * The source language of the file is ko-KR (Korean)
      */
     @Test
-    public void testAmazonTranscribeUnknownVideo_koKR() {
-        String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
+    public void testAmazonTranscribeUnknownVideo_koKR() throws Exception {
+        String file = "ko-KR_(Annyeonghaseyo).mp4";
         //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
         String expected = "Annyeonghaseyo";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(source));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -481,23 +286,13 @@ public class AmazonTranscribeTest {
      * The source language of the file is pt-BR (Portuguese - Brazil)
      */
     @Test
-    public void testAmazonTranscribeAudio_ptBR() {
-        String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3";
+    public void testAmazonTranscribeAudio_ptBR() throws Exception {
+        String file = "pt-BR_(We_Are_At_School).mp3";
         String expected = "nós estamos na escola.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(audioFilePath), "pt-BR");
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        ParseContext context = new ParseContext();
+        context.set(LanguageCode.class, LanguageCode.PtBR);
+        String xml = getXML(file, PARSER, context).xml;
+        assertContains(expected, xml);
     }
 
     /**
@@ -505,23 +300,11 @@ public class AmazonTranscribeTest {
      * The source language of the file is pt-BR (Portuguese - Brazil)
      */
     @Test
-    public void testAmazonTranscribeUnknownAudio_ptBR() {
-        String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3";
+    public void testAmazonTranscribeUnknownAudio_ptBR() throws Exception {
+        String file = "pt-BR_(We_Are_At_School).mp3";
         String expected = "nós estamos na escola.";
-        String result;
-
-        if (transcriber.isAvailable()) {
-            try {
-                result = transcriber.transcribe(new FileInputStream(audioFilePath));
-                assertNotNull(result);
-                assertEquals("Result: [" + result
-                        + "]: not equal to expected: [" + expected + "]",
-                    expected, result);
-            } catch (Exception e) {
-                e.printStackTrace();
-                fail(e.getMessage());
-            }
-        }
+        String xml = getXML(file, PARSER).xml;
+        assertContains(expected, xml);
     }
 
 }
diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-aws-transcribe.xml b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-aws-transcribe.xml
new file mode 100644
index 0000000..fb23d38
--- /dev/null
+++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-aws-transcribe.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+    <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe">
+      <params>
+        <param name="bucket" type="string">bucket</param>
+        <param name="clientId" type="string">clientId</param>
+        <param name="clientSecret" type="string">clientSecret</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
diff --git a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
deleted file mode 100644
index 1256ab6..0000000
--- a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
+++ /dev/null
@@ -1,16 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-org.apache.tika.language.translate.amazontranscribe
diff --git a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
deleted file mode 100644
index 043a66f..0000000
--- a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-transcribe.AWS_ACCESS_KEY=dummy_key
-transcribe.AWS_SECRET_KEY=dummy_key
-transcribe.BUCKET_NAME=dummy_name

Reply via email to