This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4be6a5008afc02293453b68fc8149e1b7573be5f Author: tallison <[email protected]> AuthorDate: Tue May 18 09:25:48 2021 -0400 TIKA-3384 -- convert transcribe to a traditional parser, step 2; make changes --- .../org/apache/tika/transcribe/Transcriber.java | 60 --- .../tika/example/TranscribeTranslateExample.java | 83 ++-- .../tika-parsers-ml/tika-transcribe-aws/pom.xml | 44 +- .../parser/transcribe/aws/AmazonTranscribe.java | 460 ++++++++++----------- .../transcribe/aws/AmazonTranscribeTest.java | 447 ++++++-------------- .../test/resources/tika-config-aws-transcribe.xml | 29 ++ .../org.apache.tika.language.translate.Translator | 16 - .../transcribe.amazon.properties | 18 - 8 files changed, 458 insertions(+), 699 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java deleted file mode 100644 index 3546256..0000000 --- a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tika.transcribe; - -import java.io.IOException; -import java.io.InputStream; - -import org.apache.tika.exception.TikaException; - -/** - * Interface for Transcriber services. - * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-94">TIKA-94</a> - * @since Tika 2.1 - */ -public interface Transcriber { - /** - * Transcribe the given file. - * - * @param inputStream the source input stream. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException When there is an error transcribing. - * @throws IOException If an I/O exception of some sort has occurred. - * @since 2.1 - */ - public String transcribe(InputStream inputStream) throws TikaException, IOException; - - /** - * Transcribe the given the file and the source language. - * - * @param inputStream the source input stream. - * @param sourceLanguage The language code for the language used in the input media file. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException When there is an error transcribing. - * @throws IOException If an I/O exception of some sort has occurred. - * @since 2.1 - */ - public String transcribe(InputStream inputStream, String sourceLanguage) throws TikaException, IOException; - - /** - * @return true if this Transcriber is probably able to transcribe right now. 
- * @since Tika 2.1 - */ - public boolean isAvailable(); -} diff --git a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java index 12dd7e5..a90d322 100644 --- a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java @@ -17,22 +17,23 @@ package org.apache.tika.example; -import java.io.FileInputStream; +import java.nio.file.Path; +import java.nio.file.Paths; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; import org.apache.tika.language.translate.GoogleTranslator; import org.apache.tika.language.translate.Translator; -import org.apache.tika.transcribe.AmazonTranscribe; -import org.apache.tika.transcribe.Transcriber; /** * This example demonstrates primitive logic for * chaining Tika API calls. In this case translation - * could be considered as a downstream process to + * could be considered as a downstream process to * transcription. * We simply pass the output of - * a call to {@link Transcriber#transcribe(java.io.InputStream)} - * into {@link Translator#translate(String, String)}. - * The {@link GoogleTranslator} is configured with a target + * a call to {@link Tika#parseToString(Path)} + * into {@link Translator#translate(String, String)}. + * The {@link GoogleTranslator} is configured with a target * language of "en-US". * @author lewismc * @@ -42,7 +43,7 @@ public class TranscribeTranslateExample { /** * Use {@link GoogleTranslator} to execute translation on * input data. This implementation needs configured as explained in the Javadoc. - * In this implementation, Google will try to guess the input language. The target + * In this implementation, Google will try to guess the input language. The target * language is "en-US". * @param text input text to translate. * @return translated text String. 
@@ -61,43 +62,55 @@ public class TranscribeTranslateExample { } /** - * Use {@link AmazonTranscribe} to execute transcription on input data. - * This implementation needs configured as explained in the Javadoc. + * Use {@link org.apache.tika.parser.transcribe.aws.AmazonTranscribe} to execute transcription + * on input data. + * This implementation needs to be configured as explained in the Javadoc. * @param file the name of the file (which needs to be on the Java Classpath) to transcribe. * @return transcribed text. */ - public static String amazonTranscribe(String file) { - String filePath = TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath(); - String result = null; - Transcriber transcriber = new AmazonTranscribe(); - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(filePath)); - } catch (Exception e) { - e.printStackTrace(); - } - } - return result; + public static String amazonTranscribe(Path tikaConfig, Path file) throws Exception { + return new Tika(new TikaConfig(tikaConfig)).parseToString(file); } /** * Main method to run this example. This program can be invoked as follows * <ol> - * <li><code>transcribe-translate ${file}</code>; which executes both - * transcription then translation on the given resource, or - * <li><code>transcribe ${file}</code>; which executes only translation</li> - * @param args either of the commands described above and the input file - * (which needs to be on the Java Classpath). + * <li><code>transcribe-translate ${tika-config.xml} ${file}</code>; which executes both + * transcription then translation on the given resource, or</li> + * <li><code>transcribe ${tika-config.xml} ${file}</code>; which executes only transcription</li> + * @param args either of the commands described above and the input file + * (which needs to be on the Java Classpath).
+ * + * + * + * ${tika-config.xml} must include credentials for aws and a temporary storage bucket: + * <pre> + * {@code + * <properties> + * <parsers> + * <parser class="org.apache.tika.parser.DefaultParser"/> + * <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe"> + * <params> + * <param name="bucket" type="string">bucket</param> + * <param name="clientId" type="string">clientId</param> + * <param name="clientSecret" type="string">clientSecret</param> + * </params> + * </parser> + * </parsers> + * </properties> + * } + * </pre> */ - public static void main (String[] args) { + public static void main (String[] args) throws Exception { String text = null; - if (args.length != 0) { - if ("transcribe-translate".equals(args[0])) { - text = googleTranslateToEnglish(amazonTranscribe(args[1])); - System.out.print("Transcription and translation successful!\nEXTRAXCTED TEXT: " + text); - } else if ("transcribe".equals(args[0])) { - text = amazonTranscribe(args[1]); - System.out.print("Transcription successful!\nEXTRAXCTED TEXT: " + text); + if (args.length > 1) { + if ("transcribe-translate".equals(args[1])) { + text = googleTranslateToEnglish(amazonTranscribe(Paths.get(args[0]), + Paths.get(args[1]))); + System.out.print("Transcription and translation successful!\nEXTRACTED TEXT: " + text); + } else if ("transcribe".equals(args[1])) { + text = amazonTranscribe(Paths.get(args[0]), Paths.get(args[1])); + System.out.print("Transcription successful!\nEXTRACTED TEXT: " + text); } else { System.out.print("Incorrect invocation, see Javadoc."); } diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml index 2170f8c..1e287c5 100644 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/pom.xml @@ -25,19 +25,19 @@ <modelVersion>4.0.0</modelVersion> <parent> - <groupId>org.apache.tika</groupId> 
<artifactId>tika-parsers-ml</artifactId> + <groupId>org.apache.tika</groupId> <version>2.0.0-SNAPSHOT</version> </parent> <artifactId>tika-transcribe-aws</artifactId> <packaging>bundle</packaging> - <name>Apache Tika transcribe</name> + <name>Apache Tika transcribe aws</name> <url>http://tika.apache.org/</url> <!--TODO use latest aws version or the one defined in the tika-parent--> <dependencies> <dependency> - <groupId>org.apache.tika</groupId> + <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> </dependency> @@ -54,9 +54,37 @@ <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> </exclusion> + <exclusion> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </exclusion> + <exclusion> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </exclusion> </exclusions> </dependency> <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <version>${jackson.version}</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <version>${jackson.version}</version> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>${commons.logging.version}</version> + </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>${commons.codec.version}</version> + </dependency> + <dependency> <groupId>com.amazonaws</groupId> <artifactId>aws-java-sdk-s3</artifactId> <version>${aws.version}</version> @@ -70,6 +98,14 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>test</scope> + 
<type>test-jar</type> </dependency> </dependencies> <build> @@ -110,7 +146,7 @@ <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.parser.transcribe.aws</Automatic-Module-Name> </manifestEntries> </archive> </configuration> diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java index 5b50491..91e8452 100644 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java +++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java @@ -15,7 +15,21 @@ * limitations under the License. */ -package org.apache.tika.transcribe; +package org.apache.tika.parser.transcribe.aws; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; import com.amazonaws.AmazonServiceException; import com.amazonaws.SdkClientException; @@ -39,48 +53,48 @@ import com.amazonaws.services.s3.model.SelectObjectContentRequest; import com.amazonaws.services.s3.model.SelectObjectContentResult; import com.amazonaws.services.transcribe.AmazonTranscribeAsync; import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder; +import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest; +import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult; +import 
com.amazonaws.services.transcribe.model.LanguageCode; import com.amazonaws.services.transcribe.model.Media; import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest; import com.amazonaws.services.transcribe.model.TranscriptionJob; import com.amazonaws.services.transcribe.model.TranscriptionJobStatus; -import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest; -import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult; -import com.amazonaws.services.transcribe.model.LanguageCode; -import org.apache.tika.exception.TikaException; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Properties; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; /** - * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> - * {@link Transcriber} implementation. See Javadoc for configiration options. + * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> + * implementation. See Javadoc for configuration options. 
+ * <p> + * Silently becomes unavailable when client keys are unavailable. * - * @since Tika 2.1 + * <b>N.B.</b> it is not necessary to create the bucket beforehand. + * This implementation will automatically create the bucket if one + * does not already exist, per the configured bucket name. + * + * @since Tika 2.0 */ -public class AmazonTranscribe implements Transcriber { - - public static final String PROPERTIES_FILE = "transcribe.amazon.properties"; - public static final String ID_PROPERTY = "transcribe.AWS_ACCESS_KEY"; - public static final String SECRET_PROPERTY = "transcribe.AWS_SECRET_KEY"; - public static final String DEFAULT_ID = "dummy-id"; - public static final String DEFAULT_SECRET = "dummy-secret"; - public static final String DEFAULT_BUCKET = "dummy-bucket"; - public static final String BUCKET_NAME = "transcribe.BUCKET_NAME"; - public static final String REGION = "transcribe.REGION"; - private static final Logger LOG = LoggerFactory - .getLogger(AmazonTranscribe.class); + +public class AmazonTranscribe extends AbstractParser implements Initializable { + private static final Logger LOG = LoggerFactory.getLogger(AmazonTranscribe.class); private AmazonTranscribeAsync amazonTranscribeAsync; private AmazonS3 amazonS3; private String bucketName; @@ -91,161 +105,74 @@ public class AmazonTranscribe implements Transcriber { private String clientSecret; // Keys used for the API calls. private AWSStaticCredentialsProvider credsProvider; - /** - * Create a new AmazonTranscribe instance with the client keys specified in - * <code>transcribe.amazon.properties</code> which needs to be available on - * the Java Classpath. - * Silently becomes unavailable when client keys are unavailable. - * <code>transcribe.AWS_ACCESS_KEY</code>, - * <code>transcribe.AWS_SECRET_KEY</code>, - * <code>transcribe.BUCKET_NAME</code> and - * <code>transcribe.REGION</code> must be set in - * <code>transcribe.amazon.properties</code>.
- * <b>N.B.</b> it is not necessary to create the bucket before hand. - * This implementation will automatically create the bucket if one - * does not alrerady exist, per the name defined above. - * - * @since Tika 2.0 - */ - public AmazonTranscribe() { - Properties config = new Properties(); - try { - config.load(AmazonTranscribe.class - .getResourceAsStream(PROPERTIES_FILE)); - this.clientId = config.getProperty(ID_PROPERTY); - this.clientSecret = config.getProperty(SECRET_PROPERTY); - this.bucketName = config.getProperty(BUCKET_NAME); - this.region = config.getProperty(REGION); - BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId, - this.clientSecret); - this.credsProvider = new AWSStaticCredentialsProvider(creds); - amazonS3 = AmazonS3ClientBuilder.standard() - .withCredentials(credsProvider).withRegion(this.region) - .build(); - this.isAvailable = checkAvailable(); - if (!this.amazonS3.doesBucketExistV2(this.bucketName)) { - try { - amazonS3.createBucket(this.bucketName); - } catch (AmazonS3Exception e) { - throw new RuntimeException(e.getErrorMessage()); - } - } - this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder - .standard().withCredentials(credsProvider) - .withRegion(this.region).build(); - } catch (Exception e) { - LOG.warn("Exception reading config file", e); - isAvailable = false; - } - } - - /** - * private method to get a unique job key. - * - * @return unique job key. 
- */ - private String getJobKey() { - return UUID.randomUUID().toString(); - } + //https://docs.aws.amazon.com/transcribe/latest/dg/input.html + protected static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( + new HashSet<>(Arrays.asList(MediaType.audio("x-flac"), MediaType.audio("mp3"), + MediaType.audio("mpeg"), MediaType.video("ogg"), MediaType.audio("vnd.wave"), + MediaType.audio("mp4"), MediaType.video("mp4"), MediaType.application("mp4"), + MediaType.video("quicktime")))); - /** - * Constructs a new {@link PutObjectRequest} object to upload a file to the - * specified bucket and jobName. After constructing the request, users may - * optionally specify object metadata or a canned ACL as well. - * - * @param inputStream, null - * The file to upload to Amazon S3. - * @param jobName - * The unique job name for each job(UUID). - */ - private void uploadFileToBucket(InputStream inputStream, String jobName) - throws TikaException { - PutObjectRequest request = new PutObjectRequest(this.bucketName, - jobName, inputStream, null); - try { - @SuppressWarnings("unused") - PutObjectResult response = amazonS3.putObject(request); - } catch (SdkClientException e) { - throw (new TikaException("File Upload to AWS Failed")); - } - } - /** - * Starts AWS Transcribe Job without language specification. - * - * @param inputStream - * the source input stream. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException - * When there is an error transcribing. - * @throws IOException - * If an I/O exception of some sort has occurred. 
- */ @Override - public String transcribe(InputStream inputStream) - throws TikaException, IOException { - if (!isAvailable()) - return null; - String jobName = getJobKey(); - uploadFileToBucket(inputStream, jobName); - StartTranscriptionJobRequest startTranscriptionJobRequest = new StartTranscriptionJobRequest(); - Media media = new Media(); - media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString()); - startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media) - .withOutputBucketName(this.bucketName) - .withTranscriptionJobName(jobName) - .setRequestCredentialsProvider(credsProvider); - amazonTranscribeAsync - .startTranscriptionJob(startTranscriptionJobRequest); - return getTranscriptText(jobName); + public Set<MediaType> getSupportedTypes(ParseContext context) { + if (!isAvailable) { + return Collections.EMPTY_SET; + } + return SUPPORTED_TYPES; } /** * Starts AWS Transcribe Job with language specification. * - * @param inputStream - * the source input stream. - * @param sourceLanguage - * <a href= - * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS - * Language Code</a> for the language used in the input media - * file. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException - * When there is an error transcribing. - * @throws IOException - * If an I/O exception of some sort has occurred. + * @param stream the source input stream. + * @param handler handler to use + * @param metadata + * @param context -- set the {@link LanguageCode} in the ParseContext if known + * @throws TikaException When there is an error transcribing. + * @throws IOException If an I/O exception of some sort has occurred. 
* @see <a href= - * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS - * Language Code</a> + * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS + * Language Code</a> */ @Override - public String transcribe(InputStream inputStream, String sourceLanguage) - throws TikaException, IOException { - if (!isAvailable()) - return null; + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + if (!isAvailable) { + return; + } String jobName = getJobKey(); - uploadFileToBucket(inputStream, jobName); - StartTranscriptionJobRequest startTranscriptionJobRequest = new StartTranscriptionJobRequest(); + LanguageCode languageCode = context.get(LanguageCode.class); + uploadFileToBucket(stream, jobName); + StartTranscriptionJobRequest startTranscriptionJobRequest = + new StartTranscriptionJobRequest(); Media media = new Media(); media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString()); - ((StartTranscriptionJobRequest) startTranscriptionJobRequest - .withMedia(media).withOutputBucketName(this.bucketName) - .withTranscriptionJobName(jobName) - .withRequestCredentialsProvider(credsProvider)) - .withLanguageCode( - LanguageCode.fromValue(sourceLanguage)); - amazonTranscribeAsync - .startTranscriptionJob(startTranscriptionJobRequest); - return getTranscriptText(jobName); + startTranscriptionJobRequest.withMedia(media).withOutputBucketName(this.bucketName) + .withTranscriptionJobName(jobName).setRequestCredentialsProvider(credsProvider); + + if (languageCode != null) { + startTranscriptionJobRequest.withLanguageCode(languageCode); + } else { + startTranscriptionJobRequest.withIdentifyLanguage(true); + } + amazonTranscribeAsync.startTranscriptionJob(startTranscriptionJobRequest); + XHTMLContentHandler xhtml = new 
XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + String text = getTranscriptText(jobName); + xhtml.startElement("p"); + xhtml.characters(text); + xhtml.endElement("p"); + xhtml.endDocument(); + } + /** * @return true if this Transcriber is probably able to transcribe right - * now. + * now. * @since Tika 2.1 */ - @Override public boolean isAvailable() { return this.isAvailable; } @@ -253,10 +180,10 @@ public class AmazonTranscribe implements Transcriber { /** * Sets the client Id for the transcriber API. * - * @param id - * The ID to set. + * @param id The ID to set. */ - public void setId(String id) { + @Field + public void setClientId(String id) { this.clientId = id; this.isAvailable = checkAvailable(); } @@ -264,10 +191,10 @@ public class AmazonTranscribe implements Transcriber { /** * Sets the client secret for the transcriber API. * - * @param secret - * The secret to set. + * @param secret The secret to set. */ - public void setSecret(String secret) { + @Field + public void setClientSecret(String secret) { this.clientSecret = secret; this.isAvailable = checkAvailable(); } @@ -275,89 +202,116 @@ public class AmazonTranscribe implements Transcriber { /** * Sets the client secret for the transcriber API. * - * @param bucket - * The bucket to set. + * @param bucket The bucket to set. */ + @Field public void setBucket(String bucket) { this.bucketName = bucket; this.isAvailable = checkAvailable(); } + @Field + public void setRegion(String region) { + this.region = region; + this.isAvailable = checkAvailable(); + } + /** * Private method check if the service is available. 
* * @return if the service is available */ private boolean checkAvailable() { - return clientId != null && !clientId.equals(DEFAULT_ID) - && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET) - && bucketName != null && !bucketName.equals(DEFAULT_BUCKET); + return clientId != null && clientSecret != null && bucketName != null; + } + + /** + * private method to get a unique job key. + * + * @return unique job key. + */ + private String getJobKey() { + return UUID.randomUUID().toString(); + } + + /** + * Constructs a new {@link PutObjectRequest} object to upload a file to the + * specified bucket and jobName. After constructing the request, users may + * optionally specify object metadata or a canned ACL as well. + * + * @param inputStream, null + * The file to upload to Amazon S3. + * @param jobName The unique job name for each job(UUID). + */ + private void uploadFileToBucket(InputStream inputStream, String jobName) throws TikaException { + PutObjectRequest request = + new PutObjectRequest(this.bucketName, jobName, inputStream, null); + try { + @SuppressWarnings("unused") PutObjectResult response = amazonS3.putObject(request); + } catch (SdkClientException e) { + throw (new TikaException("File Upload to AWS Failed")); + } } /** * Gets Transcription result from AWS S3 bucket given the jobName. * - * @param fileNameS3 - * The path of the file to upload to Amazon S3. + * @param fileNameS3 The path of the file to upload to Amazon S3. * @return The transcribed string result, NULL if the job failed. - * @throws IOException possible reasons include (i) an End Event is not received - * from AWS S3 SelectObjectContentResult operation and (ii) a parse exception - * whilst processing JSON from the AWS S3 SelectObjectContentResult operation. - * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult - * operation. 
+ * @throws IOException possible reasons include (i) an End Event is not received + * from AWS S3 SelectObjectContentResult operation and (ii) a parse exception + * whilst processing JSON from the AWS S3 SelectObjectContentResult operation. + * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult + * operation. * @throws AmazonServiceException possibly thrown if there is an issue selecting object content - * from AWS S3 objects. + * from AWS S3 objects. */ - private String getTranscriptText(String fileNameS3) throws AmazonServiceException, SdkClientException, IOException { - TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted( - fileNameS3); + private String getTranscriptText(String fileNameS3) + throws AmazonServiceException, SdkClientException, IOException { + TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(fileNameS3); String text = null; if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name() .equals(transcriptionJob.getTranscriptionJobStatus())) { - InputSerialization inputSerialization = new InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT)) - .withCompressionType(CompressionType.NONE); - OutputSerialization outputSerialization = new OutputSerialization().withJson(new JSONOutput()); - SelectObjectContentRequest request = new SelectObjectContentRequest() - .withBucketName(this.bucketName).withKey(fileNameS3 + ".json") - .withExpression("Select s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT MISSING - .withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider); + InputSerialization inputSerialization = + new InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT)) + .withCompressionType(CompressionType.NONE); + OutputSerialization outputSerialization = + new OutputSerialization().withJson(new JSONOutput()); + SelectObjectContentRequest request = + new 
SelectObjectContentRequest().withBucketName(this.bucketName) + .withKey(fileNameS3 + ".json").withExpression( + "Select s.results.transcripts[0].transcript from S3Object s") + //WHERE transcript IS NOT MISSING + .withExpressionType(ExpressionType.SQL) + .withRequestCredentialsProvider(credsProvider); request.setInputSerialization(inputSerialization); request.setOutputSerialization(outputSerialization); final AtomicBoolean isResultComplete = new AtomicBoolean(false); - try (SelectObjectContentResult result = amazonS3 - .selectObjectContent(request)) { + try (SelectObjectContentResult result = amazonS3.selectObjectContent(request)) { InputStream resultInputStream = result.getPayload() - .getRecordsInputStream( - new SelectObjectContentEventVisitor() { - @Override - public void visit( - SelectObjectContentEvent.StatsEvent event) { - LOG.debug( - "Received Stats, Bytes Scanned: " - + event.getDetails() - .getBytesScanned() - + " Bytes Processed: " - + event.getDetails() - .getBytesProcessed()); - } - - /* - * An End Event informs that the request has - * finished successfully. - */ - @Override - public void visit( - SelectObjectContentEvent.EndEvent event) { - isResultComplete.set(true); - LOG.debug( - "Received End Event. Result is complete."); - } - }); + .getRecordsInputStream(new SelectObjectContentEventVisitor() { + @Override + public void visit(SelectObjectContentEvent.StatsEvent event) { + LOG.debug("Received Stats, Bytes Scanned: " + + event.getDetails().getBytesScanned() + + " Bytes Processed: " + + event.getDetails().getBytesProcessed()); + } + + /* + * An End Event informs that the request has + * finished successfully. + */ + @Override + public void visit(SelectObjectContentEvent.EndEvent event) { + isResultComplete.set(true); + LOG.debug("Received End Event. 
Result is complete."); + } + }); text = new BufferedReader( - new InputStreamReader(resultInputStream, StandardCharsets.UTF_8)) - .lines() + new InputStreamReader(resultInputStream, StandardCharsets.UTF_8)).lines() .collect(Collectors.joining("\n")); } /* @@ -383,24 +337,62 @@ public class AmazonTranscribe implements Transcriber { /** * Private helper function to get object from s3. * - * @param jobName - * The unique job name for each job(UUID). + * @param jobName The unique job name for each job(UUID). * @return TranscriptionJob object */ private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) { GetTranscriptionJobRequest getTranscriptionJobRequest = new GetTranscriptionJobRequest(); - getTranscriptionJobRequest - .withRequestCredentialsProvider(credsProvider); + getTranscriptionJobRequest.withRequestCredentialsProvider(credsProvider); getTranscriptionJobRequest.setTranscriptionJobName(jobName); while (true) { - GetTranscriptionJobResult innerResult = amazonTranscribeAsync - .getTranscriptionJob(getTranscriptionJobRequest); - String status = innerResult.getTranscriptionJob() - .getTranscriptionJobStatus(); - if (TranscriptionJobStatus.COMPLETED.name().equals(status) - || TranscriptionJobStatus.FAILED.name().equals(status)) { + GetTranscriptionJobResult innerResult = + amazonTranscribeAsync.getTranscriptionJob(getTranscriptionJobRequest); + String status = innerResult.getTranscriptionJob().getTranscriptionJobStatus(); + if (TranscriptionJobStatus.COMPLETED.name().equals(status) || + TranscriptionJobStatus.FAILED.name().equals(status)) { return innerResult.getTranscriptionJob(); } } } -} \ No newline at end of file + + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + if (!checkAvailable()) { + return; + } + + try { + BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId, this.clientSecret); + this.credsProvider = new AWSStaticCredentialsProvider(creds); + if (region != null) { + this.amazonS3 = 
AmazonS3ClientBuilder.standard().withCredentials(credsProvider) + .withRegion(this.region).build(); + } else { + this.amazonS3 = + AmazonS3ClientBuilder.standard().withCredentials(credsProvider).build(); + + } + if (!this.amazonS3.doesBucketExistV2(this.bucketName)) { + try { + amazonS3.createBucket(this.bucketName); + } catch (AmazonS3Exception e) { + throw new TikaConfigException("couldn't create bucket", e); + } + } + this.amazonTranscribeAsync = + AmazonTranscribeAsyncClientBuilder.standard().withCredentials(credsProvider) + .withRegion(this.region).build(); + } catch (Exception e) { + LOG.warn("Exception reading config file", e); + isAvailable = false; + } + + } + + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) + throws TikaConfigException { + //TODO alert user if they've gotten 1 or 2 out of three? + this.isAvailable = checkAvailable(); + } +} diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java index 3b424f9..be4f76a 100644 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java +++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java @@ -14,17 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.transcribe; +package org.apache.tika.parser.transcribe.aws; -import org.junit.Before; +import java.io.InputStream; + +import com.amazonaws.services.transcribe.model.LanguageCode; +import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; -import java.io.FileInputStream; - -import static junit.framework.TestCase.assertNotNull; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; //TODO: Check the ACTUAL output of Amazon Transcribe. @@ -33,13 +35,17 @@ import static org.junit.Assert.fail; * 1) Tests that transcribe functions properly when it is given just a filepath. * 2) Both audio (mp3) and video (mp4) files are used in these tests. */ -@Ignore("Ignore until finalize AmazonTransribe Interface & build Tika") -public class AmazonTranscribeTest { - AmazonTranscribe transcriber; +@Ignore("Ignore until finalize AmazonTrancsribe Interface & build Tika") +public class AmazonTranscribeTest extends TikaTest { + + static Parser PARSER; - @Before - public void setUp() { - transcriber = new AmazonTranscribe(); + @BeforeClass + public static void setUp() throws Exception { + try (InputStream is = AmazonTranscribeTest.class + .getResourceAsStream("tika-config-aws-transcribe.xml")) { + PARSER = new TikaConfig(is).getParser(); + } } /** @@ -47,23 +53,12 @@ public class AmazonTranscribeTest { * The source language of the file is en-US (English - United States) */ @Test - public void testAmazonTranscribeAudio_enUS() { - String audioFilePath = "src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3"; + public void testAmazonTranscribeAudio_enUS() throws Exception { + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.EnUS); + String xml = getXML("en-US_(A_Little_Bottle_Of_Water).mp3", PARSER, context).xml; 
String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "en-US"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + assertContains(expected, xml); } /** @@ -71,23 +66,10 @@ public class AmazonTranscribeTest { * The source language of the file is en-US (English - United States) */ @Test - public void testAmazonTranscribeUnknownAudio_enUS() { - String audioFilePath = "src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3"; + public void testAmazonTranscribeUnknownAudio_enUS() throws Exception { + String xml = getXML("en-US_(A_Little_Bottle_Of_Water).mp3", PARSER).xml; String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + assertContains(expected, xml); } /** @@ -95,23 +77,12 @@ public class AmazonTranscribeTest { * The source language of the file is en-US (English - United States) */ @Test - public void testAmazonTranscribeVideo_enUS() { - String videoFilePath = "en-US_(Hi).mp4"; + public void testAmazonTranscribeVideo_enUS() throws Exception { String expected = "Hi"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(videoFilePath), "en-US"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = 
new ParseContext(); + context.set(LanguageCode.class, LanguageCode.EnUS); + String xml = getXML("en-US_(Hi).mp4", PARSER, context).xml; + assertContains(expected, xml); } /** @@ -119,23 +90,10 @@ public class AmazonTranscribeTest { * The source language of the file is en-US (English - United States) */ @Test - public void testAmazonTranscribeUnknownVideo_enUS() { - String videoFilePath = "en-US_(Hi).mp4"; + public void testAmazonTranscribeUnknownVideo_enUS() throws Exception { String expected = "Hi"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(videoFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML("en-US_(Hi).mp4", PARSER).xml; + assertContains(expected, xml); } /** @@ -143,23 +101,13 @@ public class AmazonTranscribeTest { * The source language of the file is en-GB (English - Great Britain) */ @Test - public void testAmazonTranscribeAudio_enGB() { - String audioFilePath = "src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3"; + public void testAmazonTranscribeAudio_enGB() throws Exception { + String file = "en-GB_(A_Little_Bottle_Of_Water).mp3"; String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "en-GB"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.EnGB); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** @@ -167,23 +115,11 @@ public class AmazonTranscribeTest { * The source 
language of the file is en-GB (English - Great Britain) */ @Test - public void testAmazonTranscribeUnknownAudio_enGB() { - String audioFilePath = "src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3"; + public void testAmazonTranscribeUnknownAudio_enGB() throws Exception { + String file = "en-GB_(A_Little_Bottle_Of_Water).mp3"; String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -191,23 +127,13 @@ public class AmazonTranscribeTest { * The source language of the file is en-AU (English - Australia) */ @Test - public void testAmazonTranscribeAudio_enAU() { - String source = "src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3"; + public void testAmazonTranscribeAudio_enAU() throws Exception { + String file = "en-AU_(A_Little_Bottle_Of_Water).mp3"; String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(source), "en-AU"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.EnAU); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** @@ -215,23 +141,11 @@ public class AmazonTranscribeTest { * The source language of the file is en-AU (English - Australian) */ @Test - public void testAmazonTranscribeUnknownAudio_enAU() { - String videoFilePath = 
"src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3"; + public void testAmazonTranscribeUnknownAudio_enAU() throws Exception { + String file = "en-AU_(A_Little_Bottle_Of_Water).mp3"; String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(videoFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -239,23 +153,13 @@ public class AmazonTranscribeTest { * The source language of the file is de-DE (German) */ @Test - public void testAmazonTranscribeAudio_deDE() { - String audioFilePath = "src/test/resources/de-DE_(We_Are_At_School_x2).mp3"; + public void testAmazonTranscribeAudio_deDE() throws Exception { + String file = "de-DE_(We_Are_At_School_x2).mp3"; String expected = "Wir sind in der Schule. 
Wir sind in der Schule."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "de-DE"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.DeDE); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** @@ -263,23 +167,11 @@ public class AmazonTranscribeTest { * The source language of the file is de-DE (German) */ @Test - public void testAmazonTranscribeUnknownAudio_deDE() { - String audioFilePath = "src/test/resources/de-DE_(We_Are_At_School_x2).mp3"; + public void testAmazonTranscribeUnknownAudio_deDE() throws Exception { + String file = "de-DE_(We_Are_At_School_x2).mp3"; String expected = "Wir sind in der Schule. Wir sind in der Schule."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -287,23 +179,13 @@ public class AmazonTranscribeTest { * The source language of the file is it-IT (Italian) */ @Test - public void testAmazonTranscribeAudio_itIT() { - String audioFilePath = "src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3"; + public void testAmazonTranscribeAudio_itIT() throws Exception { + String file = "it-IT_(We_Are_Having_Class_x2).mp3"; String expected = "stiamo facendo lezione. 
stiamo facendo lezione."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "it-IT"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.ItIT); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** @@ -311,23 +193,11 @@ public class AmazonTranscribeTest { * The source language of the file is it-IT (Italian) */ @Test - public void testAmazonTranscribeUnknownAudio_itIT() { - String audioFilePath = "src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3"; + public void testAmazonTranscribeUnknownAudio_itIT() throws Exception { + String file = "it-IT_(We_Are_Having_Class_x2).mp3"; String expected = "stiamo facendo lezione. stiamo facendo lezione."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -335,23 +205,14 @@ public class AmazonTranscribeTest { * The source language of the file is ja-JP (Japanese) */ @Test - public void testAmazonTranscribeAudio_jaJP() { - String audioFilePath = "src/test/resources/ja-JP_(We_Are_At_School).mp3"; + public void testAmazonTranscribeAudio_jaJP() throws Exception { + String file = "ja-JP_(We_Are_At_School).mp3"; String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu - String result; + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, 
LanguageCode.JaJP); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "ja-JP"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } } /** @@ -359,23 +220,11 @@ public class AmazonTranscribeTest { * The source language of the file is ja-JP (Japanese) */ @Test - public void testAmazonTranscribeUnknownAudio_jaJP() { - String audioFilePath = "src/test/resources/ja-JP_(We_Are_At_School).mp3"; + public void testAmazonTranscribeUnknownAudio_jaJP() throws Exception { + String file = "ja-JP_(We_Are_At_School).mp3"; String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -383,23 +232,13 @@ public class AmazonTranscribeTest { * The source language of the file is ko-KR (Korean) */ @Test - public void testAmazonTranscribeAudio_koKR() { - String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3"; + public void testAmazonTranscribeAudio_koKR() throws Exception { + String file = "ko-KR_(We_Are_Having_Class_x2).mp3"; String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "ko-KR"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + 
expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.KoKR); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** @@ -407,23 +246,11 @@ public class AmazonTranscribeTest { * The source language of the file is ko-KR (Korean) */ @Test - public void testAmazonTranscribeUnknownAudio_koKR() { - String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3"; + public void testAmazonTranscribeUnknownAudio_koKR() throws Exception { + String file = "ko-KR_(We_Are_Having_Class_x2).mp3"; String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -431,24 +258,14 @@ public class AmazonTranscribeTest { * The source language of the file is ko-KR (Korean) */ @Test - public void testAmazonTranscribeVideo_koKR() { - String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4"; + public void testAmazonTranscribeVideo_koKR() throws Exception { + String file = "ko-KR_(Annyeonghaseyo).mp4"; //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 String expected = "Annyeonghaseyo"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(source), "ko-KR"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context 
= new ParseContext(); + context.set(LanguageCode.class, LanguageCode.KoKR); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** @@ -456,24 +273,12 @@ public class AmazonTranscribeTest { * The source language of the file is ko-KR (Korean) */ @Test - public void testAmazonTranscribeUnknownVideo_koKR() { - String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4"; + public void testAmazonTranscribeUnknownVideo_koKR() throws Exception { + String file = "ko-KR_(Annyeonghaseyo).mp4"; //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 String expected = "Annyeonghaseyo"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(source)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } /** @@ -481,23 +286,13 @@ public class AmazonTranscribeTest { * The source language of the file is pt-BR (Portuguese - Brazil) */ @Test - public void testAmazonTranscribeAudio_ptBR() { - String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3"; + public void testAmazonTranscribeAudio_ptBR() throws Exception { + String file = "pt-BR_(We_Are_At_School).mp3"; String expected = "nós estamos na escola."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "pt-BR"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + ParseContext context = new ParseContext(); + context.set(LanguageCode.class, LanguageCode.PtBR); + String xml = getXML(file, PARSER, context).xml; + assertContains(expected, xml); } /** 
@@ -505,23 +300,11 @@ public class AmazonTranscribeTest { * The source language of the file is pt-BR (Portuguese - Brazil) */ @Test - public void testAmazonTranscribeUnknownAudio_ptBR() { - String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3"; + public void testAmazonTranscribeUnknownAudio_ptBR() throws Exception { + String file = "pt-BR_(We_Are_At_School).mp3"; String expected = "nós estamos na escola."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } + String xml = getXML(file, PARSER).xml; + assertContains(expected, xml); } } diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-aws-transcribe.xml b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-aws-transcribe.xml new file mode 100644 index 0000000..fb23d38 --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/resources/tika-config-aws-transcribe.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe"> + <params> + <param name="bucket" type="string">bucket</param> + <param name="clientId" type="string">clientId</param> + <param name="clientSecret" type="string">clientSecret</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator deleted file mode 100644 index 1256ab6..0000000 --- a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -org.apache.tika.language.translate.amazontranscribe diff --git a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties deleted file mode 100644 index 043a66f..0000000 --- a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -transcribe.AWS_ACCESS_KEY=dummy_key -transcribe.AWS_SECRET_KEY=dummy_key -transcribe.BUCKET_NAME=dummy_name
