This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit ee5a82ff2e79a7aa1b5a364037fe67efe4ae0be8 Author: tallison <[email protected]> AuthorDate: Tue May 18 08:49:07 2021 -0400 Revert "TIKA-3384 -- convert transcribe to a traditional parser" This reverts commit 2c951a35e57cf6624457798d51c1b8cbffff0f7b. --- pom.xml | 1 + .../org/apache/tika/transcribe/Transcriber.java | 60 +++ .../tika/example/TranscribeTranslateExample.java | 83 ++-- tika-parsers/tika-parsers-ml/pom.xml | 1 - .../parser/transcribe/aws/AmazonTranscribe.java | 398 ---------------- .../transcribe/aws/AmazonTranscribeTest.java | 310 ------------ .../test/resources/tika-config-transcribe-aws.xml | 32 -- .../pom.xml | 47 +- .../apache/tika/transcribe/AmazonTranscribe.java | 406 ++++++++++++++++ .../org.apache.tika.language.translate.Translator | 16 + .../transcribe.amazon.properties | 18 + .../tika/transcribe/AmazonTranscribeTest.java | 527 +++++++++++++++++++++ .../src/test/resources}/ShortAudioSampleFrench.mp3 | Bin .../resources}/de-DE_(We_Are_At_School_x2).mp3 | Bin .../en-AU_(A_Little_Bottle_Of_Water).mp3 | Bin .../en-GB_(A_Little_Bottle_Of_Water).mp3 | Bin .../en-US_(A_Little_Bottle_Of_Water).mp3 | Bin .../src/test/resources}/en-US_(Hi).mp4 | Bin .../resources}/it-IT_(We_Are_Having_Class_x2).mp3 | Bin .../test/resources}/ja-JP_(We_Are_At_School).mp3 | Bin .../src/test/resources}/ko-KR_(Annyeonghaseyo).mp4 | Bin .../resources}/ko-KR_(We_Are_Having_Class_x2).mp3 | Bin .../test/resources}/pt-BR_(We_Are_At_School).mp3 | Bin 23 files changed, 1069 insertions(+), 830 deletions(-) diff --git a/pom.xml b/pom.xml index d0e43d4..f8c6591 100644 --- a/pom.xml +++ b/pom.xml @@ -52,6 +52,7 @@ <module>tika-translate</module> <module>tika-example</module> <module>tika-java7</module> + <module>tika-transcribe</module> </modules> <profiles> diff --git a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java new file mode 100644 index 0000000..3546256 --- /dev/null +++ 
b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.transcribe; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; + +/** + * Interface for Transcriber services. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-94">TIKA-94</a> + * @since Tika 2.1 + */ +public interface Transcriber { + /** + * Transcribe the given file. + * + * @param inputStream the source input stream. + * @return The transcribed string result, NULL if the job failed. + * @throws TikaException When there is an error transcribing. + * @throws IOException If an I/O exception of some sort has occurred. + * @since 2.1 + */ + public String transcribe(InputStream inputStream) throws TikaException, IOException; + + /** + * Transcribe the given the file and the source language. + * + * @param inputStream the source input stream. + * @param sourceLanguage The language code for the language used in the input media file. + * @return The transcribed string result, NULL if the job failed. + * @throws TikaException When there is an error transcribing. 
+ * @throws IOException If an I/O exception of some sort has occurred. + * @since 2.1 + */ + public String transcribe(InputStream inputStream, String sourceLanguage) throws TikaException, IOException; + + /** + * @return true if this Transcriber is probably able to transcribe right now. + * @since Tika 2.1 + */ + public boolean isAvailable(); +} diff --git a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java index a90d322..12dd7e5 100644 --- a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java @@ -17,23 +17,22 @@ package org.apache.tika.example; -import java.nio.file.Path; -import java.nio.file.Paths; +import java.io.FileInputStream; -import org.apache.tika.Tika; -import org.apache.tika.config.TikaConfig; import org.apache.tika.language.translate.GoogleTranslator; import org.apache.tika.language.translate.Translator; +import org.apache.tika.transcribe.AmazonTranscribe; +import org.apache.tika.transcribe.Transcriber; /** * This example demonstrates primitive logic for * chaining Tika API calls. In this case translation - * could be considered as a downstream process to + * could be considered as a downstream process to * transcription. * We simply pass the output of - * a call to {@link Tika#parseToString(Path)} - * into {@link Translator#translate(String, String)}. - * The {@link GoogleTranslator} is configured with a target + * a call to {@link Transcriber#transcribe(java.io.InputStream)} + * into {@link Translator#translate(String, String)}. + * The {@link GoogleTranslator} is configured with a target * language of "en-US". * @author lewismc * @@ -43,7 +42,7 @@ public class TranscribeTranslateExample { /** * Use {@link GoogleTranslator} to execute translation on * input data. 
This implementation needs configured as explained in the Javadoc. - * In this implementation, Google will try to guess the input language. The target + * In this implementation, Google will try to guess the input language. The target * language is "en-US". * @param text input text to translate. * @return translated text String. @@ -62,55 +61,43 @@ public class TranscribeTranslateExample { } /** - * Use {@link org.apache.tika.parser.transcribe.aws.AmazonTranscribe} to execute transcription - * on input data. - * This implementation needs to be configured as explained in the Javadoc. + * Use {@link AmazonTranscribe} to execute transcription on input data. + * This implementation needs configured as explained in the Javadoc. * @param file the name of the file (which needs to be on the Java Classpath) to transcribe. * @return transcribed text. */ - public static String amazonTranscribe(Path tikaConfig, Path file) throws Exception { - return new Tika(new TikaConfig(tikaConfig)).parseToString(file); + public static String amazonTranscribe(String file) { + String filePath = TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath(); + String result = null; + Transcriber transcriber = new AmazonTranscribe(); + if (transcriber.isAvailable()) { + try { + result = transcriber.transcribe(new FileInputStream(filePath)); + } catch (Exception e) { + e.printStackTrace(); + } + } + return result; } /** * Main method to run this example. This program can be invoked as follows * <ol> - * <li><code>transcribe-translate ${tika-config.xml} ${file}</code>; which executes both - * transcription then translation on the given resource, or - * <li><code>transcribe ${tika-config.xml} ${file}</code>; which executes only translation</li> - * @param args either of the commands described above and the input file - * (which needs to be on the Java Classpath). 
- * - * - * - * ${tika-config.xml} must include credentials for aws and a temporary storage bucket: - * <pre> - * {@code - * <properties> - * <parsers> - * <parser class="org.apache.tika.parser.DefaultParser"/> - * <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe"> - * <params> - * <param name="bucket" type="string">bucket</param> - * <param name="clientId" type="string">clientId</param> - * <param name="clientSecret" type="string">clientSecret</param> - * </params> - * </parser> - * </parsers> - * </properties> - * } - * </pre> + * <li><code>transcribe-translate ${file}</code>; which executes both + * transcription then translation on the given resource, or + * <li><code>transcribe ${file}</code>; which executes only translation</li> + * @param args either of the commands described above and the input file + * (which needs to be on the Java Classpath). */ - public static void main (String[] args) throws Exception { + public static void main (String[] args) { String text = null; - if (args.length > 1) { - if ("transcribe-translate".equals(args[1])) { - text = googleTranslateToEnglish(amazonTranscribe(Paths.get(args[0]), - Paths.get(args[1]))); - System.out.print("Transcription and translation successful!\nEXTRACTED TEXT: " + text); - } else if ("transcribe".equals(args[1])) { - text = amazonTranscribe(Paths.get(args[0]), Paths.get(args[1])); - System.out.print("Transcription successful!\nEXTRACTED TEXT: " + text); + if (args.length != 0) { + if ("transcribe-translate".equals(args[0])) { + text = googleTranslateToEnglish(amazonTranscribe(args[1])); + System.out.print("Transcription and translation successful!\nEXTRAXCTED TEXT: " + text); + } else if ("transcribe".equals(args[0])) { + text = amazonTranscribe(args[1]); + System.out.print("Transcription successful!\nEXTRAXCTED TEXT: " + text); } else { System.out.print("Incorrect invocation, see Javadoc."); } diff --git a/tika-parsers/tika-parsers-ml/pom.xml b/tika-parsers/tika-parsers-ml/pom.xml 
index 2dcde9e..ba9bd38 100644 --- a/tika-parsers/tika-parsers-ml/pom.xml +++ b/tika-parsers/tika-parsers-ml/pom.xml @@ -40,7 +40,6 @@ <module>tika-age-recogniser</module> <module>tika-parser-advancedmedia-module</module> <module>tika-dl</module> - <module>tika-transcribe-aws</module> </modules> <build> diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java deleted file mode 100644 index 91e8452..0000000 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tika.parser.transcribe.aws; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; - -import com.amazonaws.AmazonServiceException; -import com.amazonaws.SdkClientException; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.AmazonS3Exception; -import com.amazonaws.services.s3.model.CompressionType; -import com.amazonaws.services.s3.model.ExpressionType; -import com.amazonaws.services.s3.model.InputSerialization; -import com.amazonaws.services.s3.model.JSONInput; -import com.amazonaws.services.s3.model.JSONOutput; -import com.amazonaws.services.s3.model.JSONType; -import com.amazonaws.services.s3.model.OutputSerialization; -import com.amazonaws.services.s3.model.PutObjectRequest; -import com.amazonaws.services.s3.model.PutObjectResult; -import com.amazonaws.services.s3.model.SelectObjectContentEvent; -import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; -import com.amazonaws.services.s3.model.SelectObjectContentRequest; -import com.amazonaws.services.s3.model.SelectObjectContentResult; -import com.amazonaws.services.transcribe.AmazonTranscribeAsync; -import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder; -import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest; -import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult; -import com.amazonaws.services.transcribe.model.LanguageCode; -import 
com.amazonaws.services.transcribe.model.Media; -import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest; -import com.amazonaws.services.transcribe.model.TranscriptionJob; -import com.amazonaws.services.transcribe.model.TranscriptionJobStatus; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -import org.apache.tika.config.Field; -import org.apache.tika.config.Initializable; -import org.apache.tika.config.InitializableProblemHandler; -import org.apache.tika.config.Param; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.XHTMLContentHandler; - -/** - * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> - * implementation. See Javadoc for configuration options. - * <p> - * Silently becomes unavailable when client keys are unavailable. - * - * <b>N.B.</b> it is not necessary to create the bucket before hand. - * This implementation will automatically create the bucket if one - * does not already exist, per the name defined above. - * - * @since Tika 2.0 - */ - -public class AmazonTranscribe extends AbstractParser implements Initializable { - private static final Logger LOG = LoggerFactory.getLogger(AmazonTranscribe.class); - private AmazonTranscribeAsync amazonTranscribeAsync; - private AmazonS3 amazonS3; - private String bucketName; - private String region; - private boolean isAvailable; // Flag for whether or not transcription is - // available. - private String clientId; - private String clientSecret; // Keys used for the API calls. 
- private AWSStaticCredentialsProvider credsProvider; - - //https://docs.aws.amazon.com/transcribe/latest/dg/input.html - protected static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList(MediaType.audio("x-flac"), MediaType.audio("mp3"), - MediaType.audio("mpeg"), MediaType.video("ogg"), MediaType.audio("vnd.wave"), - MediaType.audio("mp4"), MediaType.video("mp4"), MediaType.application("mp4"), - MediaType.video("quicktime")))); - - - @Override - public Set<MediaType> getSupportedTypes(ParseContext context) { - if (!isAvailable) { - return Collections.EMPTY_SET; - } - return SUPPORTED_TYPES; - } - - /** - * Starts AWS Transcribe Job with language specification. - * - * @param stream the source input stream. - * @param handler handler to use - * @param metadata - * @param context -- set the {@link LanguageCode} in the ParseContext if known - * @throws TikaException When there is an error transcribing. - * @throws IOException If an I/O exception of some sort has occurred. 
- * @see <a href= - * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS - * Language Code</a> - */ - @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - - if (!isAvailable) { - return; - } - String jobName = getJobKey(); - LanguageCode languageCode = context.get(LanguageCode.class); - uploadFileToBucket(stream, jobName); - StartTranscriptionJobRequest startTranscriptionJobRequest = - new StartTranscriptionJobRequest(); - Media media = new Media(); - media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString()); - startTranscriptionJobRequest.withMedia(media).withOutputBucketName(this.bucketName) - .withTranscriptionJobName(jobName).setRequestCredentialsProvider(credsProvider); - - if (languageCode != null) { - startTranscriptionJobRequest.withLanguageCode(languageCode); - } else { - startTranscriptionJobRequest.withIdentifyLanguage(true); - } - amazonTranscribeAsync.startTranscriptionJob(startTranscriptionJobRequest); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - String text = getTranscriptText(jobName); - xhtml.startElement("p"); - xhtml.characters(text); - xhtml.endElement("p"); - xhtml.endDocument(); - - } - - - /** - * @return true if this Transcriber is probably able to transcribe right - * now. - * @since Tika 2.1 - */ - public boolean isAvailable() { - return this.isAvailable; - } - - /** - * Sets the client Id for the transcriber API. - * - * @param id The ID to set. - */ - @Field - public void setClientId(String id) { - this.clientId = id; - this.isAvailable = checkAvailable(); - } - - /** - * Sets the client secret for the transcriber API. - * - * @param secret The secret to set. 
- */ - @Field - public void setClientSecret(String secret) { - this.clientSecret = secret; - this.isAvailable = checkAvailable(); - } - - /** - * Sets the client secret for the transcriber API. - * - * @param bucket The bucket to set. - */ - @Field - public void setBucket(String bucket) { - this.bucketName = bucket; - this.isAvailable = checkAvailable(); - } - - @Field - public void setRegion(String region) { - this.region = region; - this.isAvailable = checkAvailable(); - } - - /** - * Private method check if the service is available. - * - * @return if the service is available - */ - private boolean checkAvailable() { - return clientId != null && clientSecret != null && bucketName != null; - } - - /** - * private method to get a unique job key. - * - * @return unique job key. - */ - private String getJobKey() { - return UUID.randomUUID().toString(); - } - - /** - * Constructs a new {@link PutObjectRequest} object to upload a file to the - * specified bucket and jobName. After constructing the request, users may - * optionally specify object metadata or a canned ACL as well. - * - * @param inputStream, null - * The file to upload to Amazon S3. - * @param jobName The unique job name for each job(UUID). - */ - private void uploadFileToBucket(InputStream inputStream, String jobName) throws TikaException { - PutObjectRequest request = - new PutObjectRequest(this.bucketName, jobName, inputStream, null); - try { - @SuppressWarnings("unused") PutObjectResult response = amazonS3.putObject(request); - } catch (SdkClientException e) { - throw (new TikaException("File Upload to AWS Failed")); - } - } - - /** - * Gets Transcription result from AWS S3 bucket given the jobName. - * - * @param fileNameS3 The path of the file to upload to Amazon S3. - * @return The transcribed string result, NULL if the job failed. 
- * @throws IOException possible reasons include (i) an End Event is not received - * from AWS S3 SelectObjectContentResult operation and (ii) a parse exception - * whilst processing JSON from the AWS S3 SelectObjectContentResult operation. - * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult - * operation. - * @throws AmazonServiceException possibly thrown if there is an issue selecting object content - * from AWS S3 objects. - */ - private String getTranscriptText(String fileNameS3) - throws AmazonServiceException, SdkClientException, IOException { - TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(fileNameS3); - String text = null; - if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name() - .equals(transcriptionJob.getTranscriptionJobStatus())) { - InputSerialization inputSerialization = - new InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT)) - .withCompressionType(CompressionType.NONE); - OutputSerialization outputSerialization = - new OutputSerialization().withJson(new JSONOutput()); - SelectObjectContentRequest request = - new SelectObjectContentRequest().withBucketName(this.bucketName) - .withKey(fileNameS3 + ".json").withExpression( - "Select s.results.transcripts[0].transcript from S3Object s") - //WHERE transcript IS NOT MISSING - .withExpressionType(ExpressionType.SQL) - .withRequestCredentialsProvider(credsProvider); - request.setInputSerialization(inputSerialization); - request.setOutputSerialization(outputSerialization); - - final AtomicBoolean isResultComplete = new AtomicBoolean(false); - - try (SelectObjectContentResult result = amazonS3.selectObjectContent(request)) { - InputStream resultInputStream = result.getPayload() - .getRecordsInputStream(new SelectObjectContentEventVisitor() { - @Override - public void visit(SelectObjectContentEvent.StatsEvent event) { - LOG.debug("Received Stats, Bytes Scanned: " + - event.getDetails().getBytesScanned() + - " 
Bytes Processed: " + - event.getDetails().getBytesProcessed()); - } - - /* - * An End Event informs that the request has - * finished successfully. - */ - @Override - public void visit(SelectObjectContentEvent.EndEvent event) { - isResultComplete.set(true); - LOG.debug("Received End Event. Result is complete."); - } - }); - text = new BufferedReader( - new InputStreamReader(resultInputStream, StandardCharsets.UTF_8)).lines() - .collect(Collectors.joining("\n")); - } - /* - * The End Event indicates all matching records have been - * transmitted. If the End Event is not received, the results - * may be incomplete. - */ - if (!isResultComplete.get()) { - throw new IOException( - "S3 Select request was incomplete as End Event was not received."); - } - } - JSONParser parser = new JSONParser(); - JSONObject obj = null; - try { - obj = (JSONObject) parser.parse(text); - } catch (ParseException e) { - throw new IOException(e.getMessage(), e); - } - return obj.get("transcript").toString(); - } - - /** - * Private helper function to get object from s3. - * - * @param jobName The unique job name for each job(UUID). 
- * @return TranscriptionJob object - */ - private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) { - GetTranscriptionJobRequest getTranscriptionJobRequest = new GetTranscriptionJobRequest(); - getTranscriptionJobRequest.withRequestCredentialsProvider(credsProvider); - getTranscriptionJobRequest.setTranscriptionJobName(jobName); - while (true) { - GetTranscriptionJobResult innerResult = - amazonTranscribeAsync.getTranscriptionJob(getTranscriptionJobRequest); - String status = innerResult.getTranscriptionJob().getTranscriptionJobStatus(); - if (TranscriptionJobStatus.COMPLETED.name().equals(status) || - TranscriptionJobStatus.FAILED.name().equals(status)) { - return innerResult.getTranscriptionJob(); - } - } - } - - @Override - public void initialize(Map<String, Param> params) throws TikaConfigException { - if (!checkAvailable()) { - return; - } - - try { - BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId, this.clientSecret); - this.credsProvider = new AWSStaticCredentialsProvider(creds); - if (region != null) { - this.amazonS3 = AmazonS3ClientBuilder.standard().withCredentials(credsProvider) - .withRegion(this.region).build(); - } else { - this.amazonS3 = - AmazonS3ClientBuilder.standard().withCredentials(credsProvider).build(); - - } - if (!this.amazonS3.doesBucketExistV2(this.bucketName)) { - try { - amazonS3.createBucket(this.bucketName); - } catch (AmazonS3Exception e) { - throw new TikaConfigException("couldn't create bucket", e); - } - } - this.amazonTranscribeAsync = - AmazonTranscribeAsyncClientBuilder.standard().withCredentials(credsProvider) - .withRegion(this.region).build(); - } catch (Exception e) { - LOG.warn("Exception reading config file", e); - isAvailable = false; - } - - } - - @Override - public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - //TODO alert user if they've gotten 1 or 2 out of three? 
- this.isAvailable = checkAvailable(); - } -} diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java deleted file mode 100644 index be4f76a..0000000 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.transcribe.aws; - -import java.io.InputStream; - -import com.amazonaws.services.transcribe.model.LanguageCode; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; - -import org.apache.tika.TikaTest; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; - -//TODO: Check the ACTUAL output of Amazon Transcribe. - -/** - * Tests tika-trancribe by creating an AmazonTranscribe() object. - * 1) Tests that transcribe functions properly when it is given just a filepath. 
- * 2) Both audio (mp3) and video (mp4) files are used in these tests. - */ -@Ignore("Ignore until finalize AmazonTrancsribe Interface & build Tika") -public class AmazonTranscribeTest extends TikaTest { - - static Parser PARSER; - - @BeforeClass - public static void setUp() throws Exception { - try (InputStream is = AmazonTranscribeTest.class - .getResourceAsStream("tika-config-aws-transcribe.xml")) { - PARSER = new TikaConfig(is).getParser(); - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeAudio_enUS() throws Exception { - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.EnUS); - String xml = getXML("en-US_(A_Little_Bottle_Of_Water).mp3", PARSER, context).xml; - String expected = "a little bottle of water."; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeUnknownAudio_enUS() throws Exception { - String xml = getXML("en-US_(A_Little_Bottle_Of_Water).mp3", PARSER).xml; - String expected = "a little bottle of water."; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeVideo_enUS() throws Exception { - String expected = "Hi"; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.EnUS); - String xml = getXML("en-US_(Hi).mp4", PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with a video file without passing in the source language. 
- * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeUnknownVideo_enUS() throws Exception { - String expected = "Hi"; - String xml = getXML("en-US_(Hi).mp4", PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-GB (English - Great Britain) - */ - @Test - public void testAmazonTranscribeAudio_enGB() throws Exception { - String file = "en-GB_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.EnGB); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-GB (English - Great Britain) - */ - @Test - public void testAmazonTranscribeUnknownAudio_enGB() throws Exception { - String file = "en-GB_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-AU (English - Australia) - */ - @Test - public void testAmazonTranscribeAudio_enAU() throws Exception { - String file = "en-AU_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.EnAU); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. 
- * The source language of the file is en-AU (English - Australian) - */ - @Test - public void testAmazonTranscribeUnknownAudio_enAU() throws Exception { - String file = "en-AU_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is de-DE (German) - */ - @Test - public void testAmazonTranscribeAudio_deDE() throws Exception { - String file = "de-DE_(We_Are_At_School_x2).mp3"; - String expected = "Wir sind in der Schule. Wir sind in der Schule."; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.DeDE); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is de-DE (German) - */ - @Test - public void testAmazonTranscribeUnknownAudio_deDE() throws Exception { - String file = "de-DE_(We_Are_At_School_x2).mp3"; - String expected = "Wir sind in der Schule. Wir sind in der Schule."; - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is it-IT (Italian) - */ - @Test - public void testAmazonTranscribeAudio_itIT() throws Exception { - String file = "it-IT_(We_Are_Having_Class_x2).mp3"; - String expected = "stiamo facendo lezione. stiamo facendo lezione."; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.ItIT); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. 
- * The source language of the file is it-IT (Italian) - */ - @Test - public void testAmazonTranscribeUnknownAudio_itIT() throws Exception { - String file = "it-IT_(We_Are_Having_Class_x2).mp3"; - String expected = "stiamo facendo lezione. stiamo facendo lezione."; - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is ja-JP (Japanese) - */ - @Test - public void testAmazonTranscribeAudio_jaJP() throws Exception { - String file = "ja-JP_(We_Are_At_School).mp3"; - String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.JaJP); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is ja-JP (Japanese) - */ - @Test - public void testAmazonTranscribeUnknownAudio_jaJP() throws Exception { - String file = "ja-JP_(We_Are_At_School).mp3"; - String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeAudio_koKR() throws Exception { - String file = "ko-KR_(We_Are_Having_Class_x2).mp3"; - String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.KoKR); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. 
- * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeUnknownAudio_koKR() throws Exception { - String file = "ko-KR_(We_Are_Having_Class_x2).mp3"; - String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with a video file given the source language - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeVideo_koKR() throws Exception { - String file = "ko-KR_(Annyeonghaseyo).mp4"; - //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 - String expected = "Annyeonghaseyo"; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.KoKR); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an video file without passing in the source language. - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeUnknownVideo_koKR() throws Exception { - String file = "ko-KR_(Annyeonghaseyo).mp4"; - //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 - String expected = "Annyeonghaseyo"; - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is pt-BR (Portuguese - Brazil) - */ - @Test - public void testAmazonTranscribeAudio_ptBR() throws Exception { - String file = "pt-BR_(We_Are_At_School).mp3"; - String expected = "nós estamos na escola."; - ParseContext context = new ParseContext(); - context.set(LanguageCode.class, LanguageCode.PtBR); - String xml = getXML(file, PARSER, context).xml; - assertContains(expected, xml); - } - - /** - * Tests transcribe with an audio file without passing in the source language. 
- * The source language of the file is pt-BR (Portuguese - Brazil) - */ - @Test - public void testAmazonTranscribeUnknownAudio_ptBR() throws Exception { - String file = "pt-BR_(We_Are_At_School).mp3"; - String expected = "nós estamos na escola."; - String xml = getXML(file, PARSER).xml; - assertContains(expected, xml); - } - -} diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-transcribe-aws.xml b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-transcribe-aws.xml deleted file mode 100644 index 875fe5b..0000000 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-transcribe-aws.xml +++ /dev/null @@ -1,32 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"/> - <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe"> - <params> - <!-- first three are required --> - <param name="bucket" type="string">bucket</param> - <param name="clientId" type="string">clientId</param> - <param name="clientSecret" type="string">clientSecret</param> - <!-- region is optional --> - <param name="region" type="string">region</param> - </params> - </parser> - </parsers> -</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml b/tika-transcribe/pom.xml similarity index 78% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml rename to tika-transcribe/pom.xml index 1e287c5..aadb137 100644 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml +++ b/tika-transcribe/pom.xml @@ -25,19 +25,20 @@ <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>tika-parsers-ml</artifactId> <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> <version>2.0.0-SNAPSHOT</version> + <relativePath>../tika-parent/pom.xml</relativePath> </parent> - <artifactId>tika-transcribe-aws</artifactId> + <artifactId>tika-transcribe</artifactId> <packaging>bundle</packaging> - <name>Apache Tika transcribe aws</name> + <name>Apache Tika transcribe</name> <url>http://tika.apache.org/</url> <!--TODO use latest aws version or the one defined in the tika-parent--> <dependencies> <dependency> - <groupId>${project.groupId}</groupId> + <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> </dependency> @@ -54,37 +55,9 @@ <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> </exclusion> - <exclusion> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-core</artifactId> - </exclusion> - <exclusion> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-databind</artifactId> - 
</exclusion> </exclusions> </dependency> <dependency> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-core</artifactId> - <version>${jackson.version}</version> - </dependency> - <dependency> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-databind</artifactId> - <version>${jackson.version}</version> - </dependency> - <dependency> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - <version>${commons.logging.version}</version> - </dependency> - <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - <version>${commons.codec.version}</version> - </dependency> - <dependency> <groupId>com.amazonaws</groupId> <artifactId>aws-java-sdk-s3</artifactId> <version>${aws.version}</version> @@ -98,14 +71,6 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>test</scope> - <type>test-jar</type> </dependency> </dependencies> <build> @@ -146,7 +111,7 @@ <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.parser.transcribe.aws</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name> </manifestEntries> </archive> </configuration> diff --git a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java new file mode 100644 index 0000000..5b50491 --- /dev/null +++ b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.transcribe; + +import com.amazonaws.AmazonServiceException; +import com.amazonaws.SdkClientException; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.CompressionType; +import com.amazonaws.services.s3.model.ExpressionType; +import com.amazonaws.services.s3.model.InputSerialization; +import com.amazonaws.services.s3.model.JSONInput; +import com.amazonaws.services.s3.model.JSONOutput; +import com.amazonaws.services.s3.model.JSONType; +import com.amazonaws.services.s3.model.OutputSerialization; +import com.amazonaws.services.s3.model.PutObjectRequest; +import com.amazonaws.services.s3.model.PutObjectResult; +import com.amazonaws.services.s3.model.SelectObjectContentEvent; +import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.amazonaws.services.s3.model.SelectObjectContentResult; +import com.amazonaws.services.transcribe.AmazonTranscribeAsync; +import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder; +import com.amazonaws.services.transcribe.model.Media; +import 
com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest; +import com.amazonaws.services.transcribe.model.TranscriptionJob; +import com.amazonaws.services.transcribe.model.TranscriptionJobStatus; +import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest; +import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult; +import com.amazonaws.services.transcribe.model.LanguageCode; +import org.apache.tika.exception.TikaException; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Properties; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +/** + * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> + * {@link Transcriber} implementation. See Javadoc for configiration options. 
+ *
+ * @since Tika 2.1
+ */
+public class AmazonTranscribe implements Transcriber {
+
+    public static final String PROPERTIES_FILE = "transcribe.amazon.properties";
+    public static final String ID_PROPERTY = "transcribe.AWS_ACCESS_KEY";
+    public static final String SECRET_PROPERTY = "transcribe.AWS_SECRET_KEY";
+    public static final String DEFAULT_ID = "dummy-id";
+    public static final String DEFAULT_SECRET = "dummy-secret";
+    public static final String DEFAULT_BUCKET = "dummy-bucket";
+    public static final String BUCKET_NAME = "transcribe.BUCKET_NAME";
+    public static final String REGION = "transcribe.REGION";
+    private static final Logger LOG = LoggerFactory
+            .getLogger(AmazonTranscribe.class);
+    /** Pause between two status polls of a running transcription job. */
+    private static final long POLL_INTERVAL_MILLIS = 1000L;
+    private AmazonTranscribeAsync amazonTranscribeAsync;
+    private AmazonS3 amazonS3;
+    private String bucketName;
+    private String region;
+    private boolean isAvailable; // Flag for whether or not transcription is
+                                 // available.
+    private String clientId;
+    private String clientSecret; // Keys used for the API calls.
+    private AWSStaticCredentialsProvider credsProvider;
+
+    /**
+     * Create a new AmazonTranscribe instance with the client keys specified in
+     * <code>transcribe.amazon.properties</code> which needs to be available on
+     * the Java Classpath.
+     * Silently becomes unavailable when client keys are missing or still set
+     * to the placeholder values; in that case AWS is not contacted at all.
+     * <code>transcribe.AWS_ACCESS_KEY</code>,
+     * <code>transcribe.AWS_SECRET_KEY</code>,
+     * <code>transcribe.BUCKET_NAME</code> and
+     * <code>transcribe.REGION</code> must be set in
+     * <code>transcribe.amazon.properties</code>.
+     * <b>N.B.</b> it is not necessary to create the bucket before hand.
+     * This implementation will automatically create the bucket if one
+     * does not already exist, per the name defined above.
+     * Any failure while reading the configuration or creating the AWS
+     * clients is logged and leaves the instance unavailable.
+     *
+     * @since Tika 2.0
+     */
+    public AmazonTranscribe() {
+        Properties config = new Properties();
+        // try-with-resources so the properties stream is always closed
+        try (InputStream propertiesStream = AmazonTranscribe.class
+                .getResourceAsStream(PROPERTIES_FILE)) {
+            if (propertiesStream == null) {
+                throw new IOException("Could not find " + PROPERTIES_FILE
+                        + " on the classpath");
+            }
+            config.load(propertiesStream);
+            this.clientId = config.getProperty(ID_PROPERTY);
+            this.clientSecret = config.getProperty(SECRET_PROPERTY);
+            this.bucketName = config.getProperty(BUCKET_NAME);
+            this.region = config.getProperty(REGION);
+            this.isAvailable = checkAvailable();
+            if (!this.isAvailable) {
+                // Missing or placeholder keys: never talk to AWS with them.
+                return;
+            }
+            BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId,
+                    this.clientSecret);
+            this.credsProvider = new AWSStaticCredentialsProvider(creds);
+            this.amazonS3 = AmazonS3ClientBuilder.standard()
+                    .withCredentials(credsProvider).withRegion(this.region)
+                    .build();
+            if (!this.amazonS3.doesBucketExistV2(this.bucketName)) {
+                // A failure here reaches the catch below with its full cause.
+                this.amazonS3.createBucket(this.bucketName);
+            }
+            this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder
+                    .standard().withCredentials(credsProvider)
+                    .withRegion(this.region).build();
+        } catch (Exception e) {
+            LOG.warn("Unable to initialize Amazon Transcribe; transcription is unavailable", e);
+            isAvailable = false;
+        }
+    }
+
+    /**
+     * private method to get a unique job key.
+     *
+     * @return unique job key.
+     */
+    private String getJobKey() {
+        return UUID.randomUUID().toString();
+    }
+
+    /**
+     * Uploads the given stream to the configured bucket, using the job name
+     * as the object key. No object metadata is sent with the request.
+     *
+     * @param inputStream
+     *            The content to upload to Amazon S3.
+     * @param jobName
+     *            The unique job name for each job(UUID).
+     * @throws TikaException
+     *             When the upload fails; the AWS exception is kept as cause.
+     */
+    private void uploadFileToBucket(InputStream inputStream, String jobName)
+            throws TikaException {
+        PutObjectRequest request = new PutObjectRequest(this.bucketName,
+                jobName, inputStream, null);
+        try {
+            amazonS3.putObject(request);
+        } catch (SdkClientException e) {
+            throw new TikaException("File Upload to AWS Failed", e);
+        }
+    }
+
+    /**
+     * Uploads the media, starts the transcription job and waits for its
+     * transcript. Shared by both public <code>transcribe</code> methods.
+     *
+     * @param inputStream
+     *            the source input stream.
+     * @param languageCode
+     *            language of the media, or <code>null</code> to let the
+     *            service identify the language.
+     * @return The transcribed string result, NULL if the job failed.
+     * @throws TikaException
+     *             When the AWS clients were never created or the upload
+     *             fails.
+     * @throws IOException
+     *             If the transcript could not be read.
+     */
+    private String runTranscriptionJob(InputStream inputStream,
+            LanguageCode languageCode) throws TikaException, IOException {
+        if (amazonS3 == null || amazonTranscribeAsync == null) {
+            // The setters can flip the availability flag, but only the
+            // constructor creates the clients.
+            throw new TikaException(
+                    "Amazon Transcribe clients were not initialized");
+        }
+        String jobName = getJobKey();
+        uploadFileToBucket(inputStream, jobName);
+        Media media = new Media();
+        media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString());
+        StartTranscriptionJobRequest startTranscriptionJobRequest = new StartTranscriptionJobRequest()
+                .withMedia(media).withOutputBucketName(this.bucketName)
+                .withTranscriptionJobName(jobName);
+        if (languageCode == null) {
+            startTranscriptionJobRequest.withIdentifyLanguage(true);
+        } else {
+            startTranscriptionJobRequest.withLanguageCode(languageCode);
+        }
+        startTranscriptionJobRequest
+                .setRequestCredentialsProvider(credsProvider);
+        amazonTranscribeAsync
+                .startTranscriptionJob(startTranscriptionJobRequest);
+        return getTranscriptText(jobName);
+    }
+
+    /**
+     * Starts AWS Transcribe Job without language specification.
+     *
+     * @param inputStream
+     *            the source input stream.
+     * @return The transcribed string result, NULL if the job failed or the
+     *         transcriber is not available.
+     * @throws TikaException
+     *             When there is an error transcribing.
+     * @throws IOException
+     *             If an I/O exception of some sort has occurred.
+     */
+    @Override
+    public String transcribe(InputStream inputStream)
+            throws TikaException, IOException {
+        if (!isAvailable()) {
+            return null;
+        }
+        return runTranscriptionJob(inputStream, null);
+    }
+
+    /**
+     * Starts AWS Transcribe Job with language specification.
+     *
+     * @param inputStream
+     *            the source input stream.
+     * @param sourceLanguage
+     *            <a href=
+     *            "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+     *            Language Code</a> for the language used in the input media
+     *            file.
+     * @return The transcribed string result, NULL if the job failed or the
+     *         transcriber is not available.
+     * @throws TikaException
+     *             When there is an error transcribing.
+     * @throws IOException
+     *             If an I/O exception of some sort has occurred.
+     * @see <a href=
+     *      "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS
+     *      Language Code</a>
+     */
+    @Override
+    public String transcribe(InputStream inputStream, String sourceLanguage)
+            throws TikaException, IOException {
+        if (!isAvailable()) {
+            return null;
+        }
+        // Resolve the language first so an unsupported code fails before
+        // anything is uploaded to S3.
+        LanguageCode languageCode = LanguageCode.fromValue(sourceLanguage);
+        return runTranscriptionJob(inputStream, languageCode);
+    }
+
+    /**
+     * @return true if this Transcriber is probably able to transcribe right
+     *         now.
+     * @since Tika 2.1
+     */
+    @Override
+    public boolean isAvailable() {
+        return this.isAvailable;
+    }
+
+    /**
+     * Sets the client Id for the transcriber API and re-evaluates the
+     * availability flag. The AWS clients built by the constructor are not
+     * re-created.
+     *
+     * @param id
+     *            The ID to set.
+     */
+    public void setId(String id) {
+        this.clientId = id;
+        this.isAvailable = checkAvailable();
+    }
+
+    /**
+     * Sets the client secret for the transcriber API and re-evaluates the
+     * availability flag. The AWS clients built by the constructor are not
+     * re-created.
+     *
+     * @param secret
+     *            The secret to set.
+     */
+    public void setSecret(String secret) {
+        this.clientSecret = secret;
+        this.isAvailable = checkAvailable();
+    }
+
+    /**
+     * Sets the name of the S3 bucket used for uploads and transcripts and
+     * re-evaluates the availability flag.
+     *
+     * @param bucket
+     *            The bucket to set.
+     */
+    public void setBucket(String bucket) {
+        this.bucketName = bucket;
+        this.isAvailable = checkAvailable();
+    }
+
+    /**
+     * Private method check if the service is available: client id, client
+     * secret and bucket name must all be set and differ from their
+     * placeholder defaults.
+     *
+     * @return if the service is available
+     */
+    private boolean checkAvailable() {
+        return clientId != null && !clientId.equals(DEFAULT_ID)
+                && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET)
+                && bucketName != null && !bucketName.equals(DEFAULT_BUCKET);
+    }
+
+    /**
+     * Gets Transcription result from AWS S3 bucket given the jobName.
+     *
+     * @param fileNameS3
+     *            The job name; the transcript is read from the object
+     *            <code>fileNameS3 + ".json"</code> in the configured bucket.
+     * @return The transcribed string result, NULL if the job failed or
+     *         waiting for it was interrupted.
+     * @throws IOException possible reasons include (i) an End Event is not received
+     * from AWS S3 SelectObjectContentResult operation, (ii) a parse exception
+     * whilst processing JSON from the AWS S3 SelectObjectContentResult operation
+     * and (iii) a result without a transcript.
+     * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult
+     * operation.
+     * @throws AmazonServiceException possibly thrown if there is an issue selecting object content
+     * from AWS S3 objects.
+     */
+    private String getTranscriptText(String fileNameS3) throws AmazonServiceException, SdkClientException, IOException {
+        TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(
+                fileNameS3);
+        if (transcriptionJob == null || TranscriptionJobStatus.FAILED.name()
+                .equals(transcriptionJob.getTranscriptionJobStatus())) {
+            // Nothing to read; honour the documented "NULL if the job failed".
+            return null;
+        }
+        InputSerialization inputSerialization = new InputSerialization()
+                .withJson(new JSONInput().withType(JSONType.DOCUMENT))
+                .withCompressionType(CompressionType.NONE);
+        OutputSerialization outputSerialization = new OutputSerialization()
+                .withJson(new JSONOutput());
+        SelectObjectContentRequest request = new SelectObjectContentRequest()
+                .withBucketName(this.bucketName).withKey(fileNameS3 + ".json")
+                .withExpression("Select s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT MISSING
+                .withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider);
+        request.setInputSerialization(inputSerialization);
+        request.setOutputSerialization(outputSerialization);
+
+        final AtomicBoolean isResultComplete = new AtomicBoolean(false);
+        SelectObjectContentEventVisitor visitor = new SelectObjectContentEventVisitor() {
+            @Override
+            public void visit(SelectObjectContentEvent.StatsEvent event) {
+                LOG.debug("Received Stats, Bytes Scanned: {} Bytes Processed: {}",
+                        event.getDetails().getBytesScanned(),
+                        event.getDetails().getBytesProcessed());
+            }
+
+            /*
+             * An End Event informs that the request has finished
+             * successfully.
+             */
+            @Override
+            public void visit(SelectObjectContentEvent.EndEvent event) {
+                isResultComplete.set(true);
+                LOG.debug("Received End Event. Result is complete.");
+            }
+        };
+
+        String text;
+        try (SelectObjectContentResult result = amazonS3
+                .selectObjectContent(request);
+                BufferedReader reader = new BufferedReader(
+                        new InputStreamReader(
+                                result.getPayload().getRecordsInputStream(visitor),
+                                StandardCharsets.UTF_8))) {
+            text = reader.lines().collect(Collectors.joining("\n"));
+        }
+        /*
+         * The End Event indicates all matching records have been
+         * transmitted. If the End Event is not received, the results
+         * may be incomplete.
+         */
+        if (!isResultComplete.get()) {
+            throw new IOException(
+                    "S3 Select request was incomplete as End Event was not received.");
+        }
+
+        Object parsed;
+        try {
+            parsed = new JSONParser().parse(text);
+        } catch (ParseException e) {
+            throw new IOException(e.getMessage(), e);
+        }
+        Object transcript = parsed instanceof JSONObject
+                ? ((JSONObject) parsed).get("transcript") : null;
+        if (transcript == null) {
+            // Report an unexpected payload as an I/O problem instead of
+            // failing with a NullPointerException or ClassCastException.
+            throw new IOException(
+                    "S3 Select result did not contain a transcript.");
+        }
+        return transcript.toString();
+    }
+
+    /**
+     * Private helper function that polls the transcription job until it has
+     * either completed or failed, pausing between two polls.
+     *
+     * @param jobName
+     *            The unique job name for each job(UUID).
+     * @return TranscriptionJob object, or NULL when the waiting thread was
+     *         interrupted (the interrupt flag is restored).
+     */
+    private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) {
+        GetTranscriptionJobRequest getTranscriptionJobRequest = new GetTranscriptionJobRequest();
+        getTranscriptionJobRequest
+                .setRequestCredentialsProvider(credsProvider);
+        getTranscriptionJobRequest.setTranscriptionJobName(jobName);
+        while (true) {
+            GetTranscriptionJobResult innerResult = amazonTranscribeAsync
+                    .getTranscriptionJob(getTranscriptionJobRequest);
+            String status = innerResult.getTranscriptionJob()
+                    .getTranscriptionJobStatus();
+            if (TranscriptionJobStatus.COMPLETED.name().equals(status)
+                    || TranscriptionJobStatus.FAILED.name().equals(status)) {
+                return innerResult.getTranscriptionJob();
+            }
+            try {
+                // Avoid hammering the service while the job is still running.
+                Thread.sleep(POLL_INTERVAL_MILLIS);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                return null;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
new file mode 100644
index 0000000..1256ab6
--- /dev/null
+++ b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.language.translate.amazontranscribe
diff --git a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
new file mode 100644
index 0000000..043a66f
--- /dev/null
+++ b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Placeholder values: they equal AmazonTranscribe.DEFAULT_ID, DEFAULT_SECRET and
+# DEFAULT_BUCKET, so the transcriber stays unavailable until real values are set.
+transcribe.AWS_ACCESS_KEY=dummy-id
+transcribe.AWS_SECRET_KEY=dummy-secret
+transcribe.BUCKET_NAME=dummy-bucket
diff --git a/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java b/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
new file mode 100644
index 0000000..3b424f9
--- /dev/null
+++ b/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.transcribe; + +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.FileInputStream; + +import static junit.framework.TestCase.assertNotNull; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +//TODO: Check the ACTUAL output of Amazon Transcribe. + +/** + * Tests tika-trancribe by creating an AmazonTranscribe() object. + * 1) Tests that transcribe functions properly when it is given just a filepath. + * 2) Both audio (mp3) and video (mp4) files are used in these tests. 
+ */ +@Ignore("Ignore until finalize AmazonTransribe Interface & build Tika") +public class AmazonTranscribeTest { + AmazonTranscribe transcriber; + + @Before + public void setUp() { + transcriber = new AmazonTranscribe(); + } + + /** + * Tests transcribe with an audio file given the source language + * The source language of the file is en-US (English - United States) + */ + @Test + public void testAmazonTranscribeAudio_enUS() { + String audioFilePath = "src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3"; + String expected = "a little bottle of water."; + String result; + + if (transcriber.isAvailable()) { + try { + result = transcriber.transcribe(new FileInputStream(audioFilePath), "en-US"); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + + /** + * Tests transcribe with an audio file without passing in the source language. 
+ * The source language of the file is en-US (English - United States) + */ + @Test + public void testAmazonTranscribeUnknownAudio_enUS() { + String audioFilePath = "src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3"; + String expected = "a little bottle of water."; + String result; + + if (transcriber.isAvailable()) { + try { + result = transcriber.transcribe(new FileInputStream(audioFilePath)); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + + /** + * Tests transcribe with an audio file given the source language + * The source language of the file is en-US (English - United States) + */ + @Test + public void testAmazonTranscribeVideo_enUS() { + String videoFilePath = "en-US_(Hi).mp4"; + String expected = "Hi"; + String result; + + if (transcriber.isAvailable()) { + try { + result = transcriber.transcribe(new FileInputStream(videoFilePath), "en-US"); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + + /** + * Tests transcribe with a video file without passing in the source language. 
+ * The source language of the file is en-US (English - United States) + */ + @Test + public void testAmazonTranscribeUnknownVideo_enUS() { + String videoFilePath = "en-US_(Hi).mp4"; + String expected = "Hi"; + String result; + + if (transcriber.isAvailable()) { + try { + result = transcriber.transcribe(new FileInputStream(videoFilePath)); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + + /** + * Tests transcribe with an audio file given the source language + * The source language of the file is en-GB (English - Great Britain) + */ + @Test + public void testAmazonTranscribeAudio_enGB() { + String audioFilePath = "src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3"; + String expected = "a little bottle of water."; + String result; + + if (transcriber.isAvailable()) { + try { + result = transcriber.transcribe(new FileInputStream(audioFilePath), "en-GB"); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + + /** + * Tests transcribe with an audio file without passing in the source language. 
+     * The source language of the file is en-GB (English - Great Britain)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_enGB() {
+        String audioFilePath = "src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3";
+        String expected = "a little bottle of water.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file given the source language
+     * The source language of the file is en-AU (English - Australia)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeAudio_enAU() {
+        String audioFilePath = "src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
+        String expected = "a little bottle of water.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream, "en-AU");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file without passing in the source language.
+     * The source language of the file is en-AU (English - Australia)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_enAU() {
+        String audioFilePath = "src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3";
+        String expected = "a little bottle of water.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file given the source language
+     * The source language of the file is de-DE (German)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeAudio_deDE() {
+        String audioFilePath = "src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
+        String expected = "Wir sind in der Schule. Wir sind in der Schule.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream, "de-DE");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file without passing in the source language.
+     * The source language of the file is de-DE (German)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_deDE() {
+        String audioFilePath = "src/test/resources/de-DE_(We_Are_At_School_x2).mp3";
+        String expected = "Wir sind in der Schule. Wir sind in der Schule.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file given the source language
+     * The source language of the file is it-IT (Italian)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeAudio_itIT() {
+        String audioFilePath = "src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
+        String expected = "stiamo facendo lezione. stiamo facendo lezione.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream, "it-IT");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file without passing in the source language.
+     * The source language of the file is it-IT (Italian)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_itIT() {
+        String audioFilePath = "src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3";
+        String expected = "stiamo facendo lezione. stiamo facendo lezione.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file given the source language
+     * The source language of the file is ja-JP (Japanese)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeAudio_jaJP() {
+        String audioFilePath = "src/test/resources/ja-JP_(We_Are_At_School).mp3";
+        String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream, "ja-JP");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file without passing in the source language.
+     * The source language of the file is ja-JP (Japanese)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_jaJP() {
+        String audioFilePath = "src/test/resources/ja-JP_(We_Are_At_School).mp3";
+        String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file given the source language
+     * The source language of the file is ko-KR (Korean)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeAudio_koKR() {
+        String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
+        // NOTE(review): the fixture name ends in _x2, which suggests the phrase is spoken
+        // twice, yet the expected text contains it once -- confirm against the recording.
+        String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream, "ko-KR");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file without passing in the source language.
+     * The source language of the file is ko-KR (Korean)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_koKR() {
+        String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3";
+        // NOTE(review): the fixture name ends in _x2, which suggests the phrase is spoken
+        // twice, yet the expected text contains it once -- confirm against the recording.
+        String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with a video file given the source language
+     * The source language of the file is ko-KR (Korean)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeVideo_koKR() {
+        String videoFilePath = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
+        //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
+        String expected = "Annyeonghaseyo";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(videoFilePath)) {
+            String result = transcriber.transcribe(stream, "ko-KR");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with a video file without passing in the source language.
+     * The source language of the file is ko-KR (Korean)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownVideo_koKR() {
+        String videoFilePath = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4";
+        //TODO: Check whether output is Annyeonghaseyo or 안녕하세요
+        String expected = "Annyeonghaseyo";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(videoFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file given the source language
+     * The source language of the file is pt-BR (Portuguese - Brazil)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeAudio_ptBR() {
+        String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3";
+        String expected = "nós estamos na escola.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream, "pt-BR");
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+    /**
+     * Tests transcribe with an audio file without passing in the source language.
+     * The source language of the file is pt-BR (Portuguese - Brazil)
+     * <p>
+     * Nothing is asserted when {@code transcriber.isAvailable()} returns false.
+     */
+    @Test
+    public void testAmazonTranscribeUnknownAudio_ptBR() {
+        String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3";
+        String expected = "nós estamos na escola.";
+
+        if (!transcriber.isAvailable()) {
+            return;
+        }
+        // try-with-resources closes the stream even when transcribe() or an assertion throws.
+        try (FileInputStream stream = new FileInputStream(audioFilePath)) {
+            String result = transcriber.transcribe(stream);
+            assertNotNull(result);
+            assertEquals("Result: [" + result + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail(e.getMessage());
+        }
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ShortAudioSampleFrench.mp3 b/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ShortAudioSampleFrench.mp3 rename to tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/de-DE_(We_Are_At_School_x2).mp3 b/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/de-DE_(We_Are_At_School_x2).mp3 rename to tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-AU_(A_Little_Bottle_Of_Water).mp3 b/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-AU_(A_Little_Bottle_Of_Water).mp3 rename to tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3 diff --git 
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-GB_(A_Little_Bottle_Of_Water).mp3 b/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-GB_(A_Little_Bottle_Of_Water).mp3 rename to tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(A_Little_Bottle_Of_Water).mp3 b/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(A_Little_Bottle_Of_Water).mp3 rename to tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(Hi).mp4 b/tika-transcribe/src/test/resources/en-US_(Hi).mp4 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/en-US_(Hi).mp4 rename to tika-transcribe/src/test/resources/en-US_(Hi).mp4 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/it-IT_(We_Are_Having_Class_x2).mp3 b/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/it-IT_(We_Are_Having_Class_x2).mp3 rename to tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ja-JP_(We_Are_At_School).mp3 b/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3 similarity index 100% rename from 
tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ja-JP_(We_Are_At_School).mp3 rename to tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(Annyeonghaseyo).mp4 b/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(Annyeonghaseyo).mp4 rename to tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(We_Are_Having_Class_x2).mp3 b/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/ko-KR_(We_Are_Having_Class_x2).mp3 rename to tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3 diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/pt-BR_(We_Are_At_School).mp3 b/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3 similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/test-documents/pt-BR_(We_Are_At_School).mp3 rename to tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3
