This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 93d2211037b01ca237a51f83879ae35f3f76dca8 Author: tallison <[email protected]> AuthorDate: Tue May 18 05:42:18 2021 -0400 TIKA-3384 -- convert transcribe to a traditional parser --- pom.xml | 1 - .../org/apache/tika/transcribe/Transcriber.java | 60 --- tika-example/pom.xml | 8 +- .../tika/example/TranscribeTranslateExample.java | 71 +-- tika-parsers/tika-parsers-ml/pom.xml | 1 + tika-transcribe/pom.xml | 159 ------- .../apache/tika/transcribe/AmazonTranscribe.java | 406 ---------------- .../org.apache.tika.language.translate.Translator | 16 - .../transcribe.amazon.properties | 18 - .../tika/transcribe/AmazonTranscribeTest.java | 527 --------------------- .../src/test/resources/ShortAudioSampleFrench.mp3 | Bin 25861 -> 0 bytes .../test/resources/de-DE_(We_Are_At_School_x2).mp3 | Bin 38547 -> 0 bytes .../resources/en-AU_(A_Little_Bottle_Of_Water).mp3 | Bin 33365 -> 0 bytes .../resources/en-GB_(A_Little_Bottle_Of_Water).mp3 | Bin 35872 -> 0 bytes .../resources/en-US_(A_Little_Bottle_Of_Water).mp3 | Bin 29603 -> 0 bytes tika-transcribe/src/test/resources/en-US_(Hi).mp4 | Bin 21739 -> 0 bytes .../resources/it-IT_(We_Are_Having_Class_x2).mp3 | Bin 42219 -> 0 bytes .../test/resources/ja-JP_(We_Are_At_School).mp3 | Bin 21699 -> 0 bytes .../src/test/resources/ko-KR_(Annyeonghaseyo).mp4 | Bin 144151 -> 0 bytes .../resources/ko-KR_(We_Are_Having_Class_x2).mp3 | Bin 66843 -> 0 bytes .../test/resources/pt-BR_(We_Are_At_School).mp3 | Bin 29043 -> 0 bytes 21 files changed, 47 insertions(+), 1220 deletions(-) diff --git a/pom.xml b/pom.xml index f8c6591..d0e43d4 100644 --- a/pom.xml +++ b/pom.xml @@ -52,7 +52,6 @@ <module>tika-translate</module> <module>tika-example</module> <module>tika-java7</module> - <module>tika-transcribe</module> </modules> <profiles> diff --git a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java b/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java deleted file mode 100644 index 3546256..0000000 --- a/tika-core/src/main/java/org/apache/tika/transcribe/Transcriber.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.transcribe; - -import java.io.IOException; -import java.io.InputStream; - -import org.apache.tika.exception.TikaException; - -/** - * Interface for Transcriber services. - * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-94">TIKA-94</a> - * @since Tika 2.1 - */ -public interface Transcriber { - /** - * Transcribe the given file. - * - * @param inputStream the source input stream. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException When there is an error transcribing. - * @throws IOException If an I/O exception of some sort has occurred. - * @since 2.1 - */ - public String transcribe(InputStream inputStream) throws TikaException, IOException; - - /** - * Transcribe the given the file and the source language. - * - * @param inputStream the source input stream. - * @param sourceLanguage The language code for the language used in the input media file. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException When there is an error transcribing. - * @throws IOException If an I/O exception of some sort has occurred. - * @since 2.1 - */ - public String transcribe(InputStream inputStream, String sourceLanguage) throws TikaException, IOException; - - /** - * @return true if this Transcriber is probably able to transcribe right now. - * @since Tika 2.1 - */ - public boolean isAvailable(); -} diff --git a/tika-example/pom.xml b/tika-example/pom.xml index f12304e..ce6a2b3 100644 --- a/tika-example/pom.xml +++ b/tika-example/pom.xml @@ -64,13 +64,13 @@ <version>${project.version}</version> </dependency> <dependency> - <groupId>org.apache.tika</groupId> + <groupId>${project.groupId}</groupId> <artifactId>tika-eval-core</artifactId> <version>${project.version}</version> </dependency> <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-transcribe</artifactId> + <groupId>${project.groupId}</groupId> + <artifactId>tika-transcribe-aws</artifactId> <version>${project.version}</version> <exclusions> <exclusion> @@ -88,7 +88,7 @@ </exclusions> </dependency> <dependency> - <groupId>org.apache.tika</groupId> + <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> <type>test-jar</type> diff --git a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java index 12dd7e5..f77af72 100644 --- a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java @@ -17,12 +17,14 @@ package org.apache.tika.example; -import java.io.FileInputStream; +import java.nio.file.Path; +import java.nio.file.Paths; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; import org.apache.tika.language.translate.GoogleTranslator; import org.apache.tika.language.translate.Translator; -import org.apache.tika.transcribe.AmazonTranscribe; -import org.apache.tika.transcribe.Transcriber; +import org.apache.tika.parser.transcribe.aws.AmazonTranscribe; /** * This example demonstrates primitive logic for @@ -30,8 +32,8 @@ import org.apache.tika.transcribe.Transcriber; * could be considered as a downstream process to * transcription. * We simply pass the output of - * a call to {@link Transcriber#transcribe(java.io.InputStream)} - * into {@link Translator#translate(String, String)}. + * a call to {@link Tika#parseToString(Path)} + * into {@link Translator#translate(String, String)}. * The {@link GoogleTranslator} is configured with a target * language of "en-US". * @author lewismc @@ -62,42 +64,53 @@ public class TranscribeTranslateExample { /** * Use {@link AmazonTranscribe} to execute transcription on input data. - * This implementation needs configured as explained in the Javadoc. + * This implementation needs to be configured as explained in the Javadoc. * @param file the name of the file (which needs to be on the Java Classpath) to transcribe. * @return transcribed text. */ - public static String amazonTranscribe(String file) { - String filePath = TranscribeTranslateExample.class.getClassLoader().getResource(file).getPath(); - String result = null; - Transcriber transcriber = new AmazonTranscribe(); - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(filePath)); - } catch (Exception e) { - e.printStackTrace(); - } - } - return result; + public static String amazonTranscribe(Path tikaConfig, Path file) throws Exception { + return new Tika(new TikaConfig(tikaConfig)).parseToString(file); } /** * Main method to run this example. This program can be invoked as follows * <ol> - * <li><code>transcribe-translate ${file}</code>; which executes both + * <li><code>transcribe-translate ${tika-config.xml} ${file}</code>; which executes both * transcription then translation on the given resource, or - * <li><code>transcribe ${file}</code>; which executes only translation</li> + * <li><code>transcribe ${tika-config.xml} ${file}</code>; which executes only translation</li> * @param args either of the commands described above and the input file - * (which needs to be on the Java Classpath). + * (which needs to be on the Java Classpath). + * + * + * + * ${tika-config.xml} must include credentials for aws and a temporary storage bucket: + * <pre> + * {@code + * <properties> + * <parsers> + * <parser class="org.apache.tika.parser.DefaultParser"/> + * <parser class="org.apache.tika.parser.transcribe.aws.AmazonTranscribe"> + * <params> + * <param name="bucket" type="string">bucket</param> + * <param name="clientId" type="string">clientId</param> + * <param name="clientSecret" type="string">clientSecret</param> + * </params> + * </parser> + * </parsers> + * </properties> + * } + * </pre> */ - public static void main (String[] args) { + public static void main (String[] args) throws Exception { String text = null; - if (args.length != 0) { - if ("transcribe-translate".equals(args[0])) { - text = googleTranslateToEnglish(amazonTranscribe(args[1])); - System.out.print("Transcription and translation successful!\nEXTRAXCTED TEXT: " + text); - } else if ("transcribe".equals(args[0])) { - text = amazonTranscribe(args[1]); - System.out.print("Transcription successful!\nEXTRAXCTED TEXT: " + text); + if (args.length > 1) { + if ("transcribe-translate".equals(args[1])) { + text = googleTranslateToEnglish(amazonTranscribe(Paths.get(args[0]), + Paths.get(args[1]))); + System.out.print("Transcription and translation successful!\nEXTRACTED TEXT: " + text); + } else if ("transcribe".equals(args[1])) { + text = amazonTranscribe(Paths.get(args[0]), Paths.get(args[1])); + System.out.print("Transcription successful!\nEXTRACTED TEXT: " + text); } else { System.out.print("Incorrect invocation, see Javadoc."); } diff --git a/tika-parsers/tika-parsers-ml/pom.xml b/tika-parsers/tika-parsers-ml/pom.xml index ba9bd38..2dcde9e 100644 --- a/tika-parsers/tika-parsers-ml/pom.xml +++ b/tika-parsers/tika-parsers-ml/pom.xml @@ -40,6 +40,7 @@ <module>tika-age-recogniser</module> <module>tika-parser-advancedmedia-module</module> <module>tika-dl</module> + <module>tika-transcribe-aws</module> </modules> <build> diff --git a/tika-transcribe/pom.xml b/tika-transcribe/pom.xml deleted file mode 100644 index aadb137..0000000 --- a/tika-transcribe/pom.xml +++ /dev/null @@ -1,159 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parent</artifactId> - <version>2.0.0-SNAPSHOT</version> - <relativePath>../tika-parent/pom.xml</relativePath> - </parent> - - <artifactId>tika-transcribe</artifactId> - <packaging>bundle</packaging> - <name>Apache Tika transcribe</name> - <url>http://tika.apache.org/</url> - <!--TODO use latest aws version or the one defined in the tika-parent--> - <dependencies> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>com.amazonaws</groupId> - <artifactId>aws-java-sdk-transcribe</artifactId> - <version>${aws.version}</version> - <exclusions> - <exclusion> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </exclusion> - <exclusion> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>com.amazonaws</groupId> - <artifactId>aws-java-sdk-s3</artifactId> - <version>${aws.version}</version> - </dependency> - <dependency> - <groupId>com.googlecode.json-simple</groupId> - <artifactId>json-simple</artifactId> - <version>${json.simple.version}</version> - </dependency> - <!-- Test dependencies --> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - </dependency> - </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.felix</groupId> - <artifactId>maven-bundle-plugin</artifactId> - <version>${maven.bundle.version}</version> - <extensions>true</extensions> - <configuration> - <instructions> - <Bundle-DocURL>${project.url}</Bundle-DocURL> - <Bundle-Activator> - org.apache.tika.parser.internal.Activator - </Bundle-Activator> - <Import-Package> - org.w3c.dom, - org.apache.tika.*, - *;resolution:=optional - </Import-Package> - </instructions> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.rat</groupId> - <artifactId>apache-rat-plugin</artifactId> - <version>${rat.version}</version> - <configuration> - <excludes> - <exclude>src/main/java/org/apache/tika/parser/txt/Charset*.java</exclude> - <exclude>src/test/resources/test-documents/**</exclude> - </excludes> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <configuration> - <archive> - <manifestEntries> - <Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name> - </manifestEntries> - </archive> - </configuration> - <executions> - <execution> - <goals> - <goal>test-jar</goal> - </goals> - </execution> - </executions> - </plugin> - </plugins> - - <pluginManagement> - <plugins> - <!-- This plugin's configuration is used to store Eclipse m2e --> - <!-- settings only. It has no influence on the Maven build itself. --> - <plugin> - <groupId>org.eclipse.m2e</groupId> - <artifactId>lifecycle-mapping</artifactId> - <version>1.0.0</version> - <configuration> - <lifecycleMappingMetadata> - <pluginExecutions> - <pluginExecution> - <pluginExecutionFilter> - <groupId>org.apache.felix</groupId> - <artifactId>maven-scr-plugin</artifactId> - <version>${maven.scr.version}</version> - <goals> - <goal>scr</goal> - </goals> - </pluginExecutionFilter> - <action> - <execute/> - </action> - </pluginExecution> - </pluginExecutions> - </lifecycleMappingMetadata> - </configuration> - </plugin> - </plugins> - </pluginManagement> - </build> -</project> \ No newline at end of file diff --git a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java b/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java deleted file mode 100644 index 5b50491..0000000 --- a/tika-transcribe/src/main/java/org/apache/tika/transcribe/AmazonTranscribe.java +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.transcribe; - -import com.amazonaws.AmazonServiceException; -import com.amazonaws.SdkClientException; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.AmazonS3Exception; -import com.amazonaws.services.s3.model.CompressionType; -import com.amazonaws.services.s3.model.ExpressionType; -import com.amazonaws.services.s3.model.InputSerialization; -import com.amazonaws.services.s3.model.JSONInput; -import com.amazonaws.services.s3.model.JSONOutput; -import com.amazonaws.services.s3.model.JSONType; -import com.amazonaws.services.s3.model.OutputSerialization; -import com.amazonaws.services.s3.model.PutObjectRequest; -import com.amazonaws.services.s3.model.PutObjectResult; -import com.amazonaws.services.s3.model.SelectObjectContentEvent; -import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; -import com.amazonaws.services.s3.model.SelectObjectContentRequest; -import com.amazonaws.services.s3.model.SelectObjectContentResult; -import com.amazonaws.services.transcribe.AmazonTranscribeAsync; -import com.amazonaws.services.transcribe.AmazonTranscribeAsyncClientBuilder; -import com.amazonaws.services.transcribe.model.Media; -import com.amazonaws.services.transcribe.model.StartTranscriptionJobRequest; -import com.amazonaws.services.transcribe.model.TranscriptionJob; -import com.amazonaws.services.transcribe.model.TranscriptionJobStatus; -import com.amazonaws.services.transcribe.model.GetTranscriptionJobRequest; -import com.amazonaws.services.transcribe.model.GetTranscriptionJobResult; -import com.amazonaws.services.transcribe.model.LanguageCode; -import org.apache.tika.exception.TikaException; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Properties; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; - -/** - * <a href="https://aws.amazon.com/transcribe/">Amazon Transcribe</a> - * {@link Transcriber} implementation. See Javadoc for configiration options. - * - * @since Tika 2.1 - */ -public class AmazonTranscribe implements Transcriber { - - public static final String PROPERTIES_FILE = "transcribe.amazon.properties"; - public static final String ID_PROPERTY = "transcribe.AWS_ACCESS_KEY"; - public static final String SECRET_PROPERTY = "transcribe.AWS_SECRET_KEY"; - public static final String DEFAULT_ID = "dummy-id"; - public static final String DEFAULT_SECRET = "dummy-secret"; - public static final String DEFAULT_BUCKET = "dummy-bucket"; - public static final String BUCKET_NAME = "transcribe.BUCKET_NAME"; - public static final String REGION = "transcribe.REGION"; - private static final Logger LOG = LoggerFactory - .getLogger(AmazonTranscribe.class); - private AmazonTranscribeAsync amazonTranscribeAsync; - private AmazonS3 amazonS3; - private String bucketName; - private String region; - private boolean isAvailable; // Flag for whether or not transcription is - // available. - private String clientId; - private String clientSecret; // Keys used for the API calls. - private AWSStaticCredentialsProvider credsProvider; - - /** - * Create a new AmazonTranscribe instance with the client keys specified in - * <code>transcribe.amazon.properties</code> which needs to be available on - * the Java Classpath. - * Silently becomes unavailable when client keys are unavailable. - * <code>transcribe.AWS_ACCESS_KEY</code>, - * <code>transcribe.AWS_SECRET_KEY</code>, - * <code>transcribe.BUCKET_NAME</code> and - * <code>transcribe.REGION</code> must be set in - * <code>transcribe.amazon.properties</code>. - * <b>N.B.</b> it is not necessary to create the bucket before hand. - * This implementation will automatically create the bucket if one - * does not alrerady exist, per the name defined above. - * - * @since Tika 2.0 - */ - public AmazonTranscribe() { - Properties config = new Properties(); - try { - config.load(AmazonTranscribe.class - .getResourceAsStream(PROPERTIES_FILE)); - this.clientId = config.getProperty(ID_PROPERTY); - this.clientSecret = config.getProperty(SECRET_PROPERTY); - this.bucketName = config.getProperty(BUCKET_NAME); - this.region = config.getProperty(REGION); - BasicAWSCredentials creds = new BasicAWSCredentials(this.clientId, - this.clientSecret); - this.credsProvider = new AWSStaticCredentialsProvider(creds); - amazonS3 = AmazonS3ClientBuilder.standard() - .withCredentials(credsProvider).withRegion(this.region) - .build(); - this.isAvailable = checkAvailable(); - if (!this.amazonS3.doesBucketExistV2(this.bucketName)) { - try { - amazonS3.createBucket(this.bucketName); - } catch (AmazonS3Exception e) { - throw new RuntimeException(e.getErrorMessage()); - } - } - this.amazonTranscribeAsync = AmazonTranscribeAsyncClientBuilder - .standard().withCredentials(credsProvider) - .withRegion(this.region).build(); - } catch (Exception e) { - LOG.warn("Exception reading config file", e); - isAvailable = false; - } - } - - /** - * private method to get a unique job key. - * - * @return unique job key. - */ - private String getJobKey() { - return UUID.randomUUID().toString(); - } - - /** - * Constructs a new {@link PutObjectRequest} object to upload a file to the - * specified bucket and jobName. After constructing the request, users may - * optionally specify object metadata or a canned ACL as well. - * - * @param inputStream, null - * The file to upload to Amazon S3. - * @param jobName - * The unique job name for each job(UUID). - */ - private void uploadFileToBucket(InputStream inputStream, String jobName) - throws TikaException { - PutObjectRequest request = new PutObjectRequest(this.bucketName, - jobName, inputStream, null); - try { - @SuppressWarnings("unused") - PutObjectResult response = amazonS3.putObject(request); - } catch (SdkClientException e) { - throw (new TikaException("File Upload to AWS Failed")); - } - } - - /** - * Starts AWS Transcribe Job without language specification. - * - * @param inputStream - * the source input stream. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException - * When there is an error transcribing. - * @throws IOException - * If an I/O exception of some sort has occurred. - */ - @Override - public String transcribe(InputStream inputStream) - throws TikaException, IOException { - if (!isAvailable()) - return null; - String jobName = getJobKey(); - uploadFileToBucket(inputStream, jobName); - StartTranscriptionJobRequest startTranscriptionJobRequest = new StartTranscriptionJobRequest(); - Media media = new Media(); - media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString()); - startTranscriptionJobRequest.withIdentifyLanguage(true).withMedia(media) - .withOutputBucketName(this.bucketName) - .withTranscriptionJobName(jobName) - .setRequestCredentialsProvider(credsProvider); - amazonTranscribeAsync - .startTranscriptionJob(startTranscriptionJobRequest); - return getTranscriptText(jobName); - } - - /** - * Starts AWS Transcribe Job with language specification. - * - * @param inputStream - * the source input stream. - * @param sourceLanguage - * <a href= - * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS - * Language Code</a> for the language used in the input media - * file. - * @return The transcribed string result, NULL if the job failed. - * @throws TikaException - * When there is an error transcribing. - * @throws IOException - * If an I/O exception of some sort has occurred. - * @see <a href= - * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS - * Language Code</a> - */ - @Override - public String transcribe(InputStream inputStream, String sourceLanguage) - throws TikaException, IOException { - if (!isAvailable()) - return null; - String jobName = getJobKey(); - uploadFileToBucket(inputStream, jobName); - StartTranscriptionJobRequest startTranscriptionJobRequest = new StartTranscriptionJobRequest(); - Media media = new Media(); - media.setMediaFileUri(amazonS3.getUrl(bucketName, jobName).toString()); - ((StartTranscriptionJobRequest) startTranscriptionJobRequest - .withMedia(media).withOutputBucketName(this.bucketName) - .withTranscriptionJobName(jobName) - .withRequestCredentialsProvider(credsProvider)) - .withLanguageCode( - LanguageCode.fromValue(sourceLanguage)); - amazonTranscribeAsync - .startTranscriptionJob(startTranscriptionJobRequest); - return getTranscriptText(jobName); - } - - /** - * @return true if this Transcriber is probably able to transcribe right - * now. - * @since Tika 2.1 - */ - @Override - public boolean isAvailable() { - return this.isAvailable; - } - - /** - * Sets the client Id for the transcriber API. - * - * @param id - * The ID to set. - */ - public void setId(String id) { - this.clientId = id; - this.isAvailable = checkAvailable(); - } - - /** - * Sets the client secret for the transcriber API. - * - * @param secret - * The secret to set. - */ - public void setSecret(String secret) { - this.clientSecret = secret; - this.isAvailable = checkAvailable(); - } - - /** - * Sets the client secret for the transcriber API. - * - * @param bucket - * The bucket to set. - */ - public void setBucket(String bucket) { - this.bucketName = bucket; - this.isAvailable = checkAvailable(); - } - - /** - * Private method check if the service is available. - * - * @return if the service is available - */ - private boolean checkAvailable() { - return clientId != null && !clientId.equals(DEFAULT_ID) - && clientSecret != null && !clientSecret.equals(DEFAULT_SECRET) - && bucketName != null && !bucketName.equals(DEFAULT_BUCKET); - } - - /** - * Gets Transcription result from AWS S3 bucket given the jobName. - * - * @param fileNameS3 - * The path of the file to upload to Amazon S3. - * @return The transcribed string result, NULL if the job failed. - * @throws IOException possible reasons include (i) an End Event is not received - * from AWS S3 SelectObjectContentResult operation and (ii) a parse exception - * whilst processing JSON from the AWS S3 SelectObjectContentResult operation. - * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult - * operation. - * @throws AmazonServiceException possibly thrown if there is an issue selecting object content - * from AWS S3 objects. - */ - private String getTranscriptText(String fileNameS3) throws AmazonServiceException, SdkClientException, IOException { - TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted( - fileNameS3); - String text = null; - if (transcriptionJob != null && !TranscriptionJobStatus.FAILED.name() - .equals(transcriptionJob.getTranscriptionJobStatus())) { - InputSerialization inputSerialization = new InputSerialization().withJson(new JSONInput().withType(JSONType.DOCUMENT)) - .withCompressionType(CompressionType.NONE); - OutputSerialization outputSerialization = new OutputSerialization().withJson(new JSONOutput()); - SelectObjectContentRequest request = new SelectObjectContentRequest() - .withBucketName(this.bucketName).withKey(fileNameS3 + ".json") - .withExpression("Select s.results.transcripts[0].transcript from S3Object s")//WHERE transcript IS NOT MISSING - .withExpressionType(ExpressionType.SQL).withRequestCredentialsProvider(credsProvider); - request.setInputSerialization(inputSerialization); - request.setOutputSerialization(outputSerialization); - - final AtomicBoolean isResultComplete = new AtomicBoolean(false); - - try (SelectObjectContentResult result = amazonS3 - .selectObjectContent(request)) { - InputStream resultInputStream = result.getPayload() - .getRecordsInputStream( - new SelectObjectContentEventVisitor() { - @Override - public void visit( - SelectObjectContentEvent.StatsEvent event) { - LOG.debug( - "Received Stats, Bytes Scanned: " - + event.getDetails() - .getBytesScanned() - + " Bytes Processed: " - + event.getDetails() - .getBytesProcessed()); - } - - /* - * An End Event informs that the request has - * finished successfully. - */ - @Override - public void visit( - SelectObjectContentEvent.EndEvent event) { - isResultComplete.set(true); - LOG.debug( - "Received End Event. Result is complete."); - } - }); - text = new BufferedReader( - new InputStreamReader(resultInputStream, StandardCharsets.UTF_8)) - .lines() - .collect(Collectors.joining("\n")); - } - /* - * The End Event indicates all matching records have been - * transmitted. If the End Event is not received, the results - * may be incomplete. - */ - if (!isResultComplete.get()) { - throw new IOException( - "S3 Select request was incomplete as End Event was not received."); - } - } - JSONParser parser = new JSONParser(); - JSONObject obj = null; - try { - obj = (JSONObject) parser.parse(text); - } catch (ParseException e) { - throw new IOException(e.getMessage(), e); - } - return obj.get("transcript").toString(); - } - - /** - * Private helper function to get object from s3. - * - * @param jobName - * The unique job name for each job(UUID). - * @return TranscriptionJob object - */ - private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) { - GetTranscriptionJobRequest getTranscriptionJobRequest = new GetTranscriptionJobRequest(); - getTranscriptionJobRequest - .withRequestCredentialsProvider(credsProvider); - getTranscriptionJobRequest.setTranscriptionJobName(jobName); - while (true) { - GetTranscriptionJobResult innerResult = amazonTranscribeAsync - .getTranscriptionJob(getTranscriptionJobRequest); - String status = innerResult.getTranscriptionJob() - .getTranscriptionJobStatus(); - if (TranscriptionJobStatus.COMPLETED.name().equals(status) - || TranscriptionJobStatus.FAILED.name().equals(status)) { - return innerResult.getTranscriptionJob(); - } - } - } -} \ No newline at end of file diff --git a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator b/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator deleted file mode 100644 index 1256ab6..0000000 --- a/tika-transcribe/src/main/resources/META-INF.services/org.apache.tika.language.translate.Translator +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.tika.language.translate.amazontranscribe diff --git a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties b/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties deleted file mode 100644 index 043a66f..0000000 --- a/tika-transcribe/src/main/resources/org.apache.tika.transcribe/transcribe.amazon.properties +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -transcribe.AWS_ACCESS_KEY=dummy_key -transcribe.AWS_SECRET_KEY=dummy_key -transcribe.BUCKET_NAME=dummy_name diff --git a/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java b/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java deleted file mode 100644 index 3b424f9..0000000 --- a/tika-transcribe/src/test/java/org/apache/tika/transcribe/AmazonTranscribeTest.java +++ /dev/null @@ -1,527 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.transcribe; - -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.FileInputStream; - -import static junit.framework.TestCase.assertNotNull; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - -//TODO: Check the ACTUAL output of Amazon Transcribe. - -/** - * Tests tika-trancribe by creating an AmazonTranscribe() object. - * 1) Tests that transcribe functions properly when it is given just a filepath. - * 2) Both audio (mp3) and video (mp4) files are used in these tests. - */ -@Ignore("Ignore until finalize AmazonTransribe Interface & build Tika") -public class AmazonTranscribeTest { - AmazonTranscribe transcriber; - - @Before - public void setUp() { - transcriber = new AmazonTranscribe(); - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeAudio_enUS() { - String audioFilePath = "src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "en-US"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeUnknownAudio_enUS() { - String audioFilePath = "src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeVideo_enUS() { - String videoFilePath = "en-US_(Hi).mp4"; - String expected = "Hi"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(videoFilePath), "en-US"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with a video file without passing in the source language. - * The source language of the file is en-US (English - United States) - */ - @Test - public void testAmazonTranscribeUnknownVideo_enUS() { - String videoFilePath = "en-US_(Hi).mp4"; - String expected = "Hi"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(videoFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-GB (English - Great Britain) - */ - @Test - public void testAmazonTranscribeAudio_enGB() { - String audioFilePath = "src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "en-GB"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-GB (English - Great Britain) - */ - @Test - public void testAmazonTranscribeUnknownAudio_enGB() { - String audioFilePath = "src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-AU (English - Australia) - */ - @Test - public void testAmazonTranscribeAudio_enAU() { - String source = "src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(source), "en-AU"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-AU (English - Australian) - */ - @Test - public void testAmazonTranscribeUnknownAudio_enAU() { - String videoFilePath = "src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3"; - String expected = "a little bottle of water."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(videoFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is de-DE (German) - */ - @Test - public void testAmazonTranscribeAudio_deDE() { - String audioFilePath = "src/test/resources/de-DE_(We_Are_At_School_x2).mp3"; - String expected = "Wir sind in der Schule. Wir sind in der Schule."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "de-DE"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is de-DE (German) - */ - @Test - public void testAmazonTranscribeUnknownAudio_deDE() { - String audioFilePath = "src/test/resources/de-DE_(We_Are_At_School_x2).mp3"; - String expected = "Wir sind in der Schule. Wir sind in der Schule."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is it-IT (Italian) - */ - @Test - public void testAmazonTranscribeAudio_itIT() { - String audioFilePath = "src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3"; - String expected = "stiamo facendo lezione. stiamo facendo lezione."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "it-IT"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is it-IT (Italian) - */ - @Test - public void testAmazonTranscribeUnknownAudio_itIT() { - String audioFilePath = "src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3"; - String expected = "stiamo facendo lezione. stiamo facendo lezione."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is ja-JP (Japanese) - */ - @Test - public void testAmazonTranscribeAudio_jaJP() { - String audioFilePath = "src/test/resources/ja-JP_(We_Are_At_School).mp3"; - String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "ja-JP"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is ja-JP (Japanese) - */ - @Test - public void testAmazonTranscribeUnknownAudio_jaJP() { - String audioFilePath = "src/test/resources/ja-JP_(We_Are_At_School).mp3"; - String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeAudio_koKR() { - String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3"; - String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "ko-KR"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeUnknownAudio_koKR() { - String audioFilePath = "src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3"; - String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with a video file given the source language - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeVideo_koKR() { - String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4"; - //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 - String expected = "Annyeonghaseyo"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(source), "ko-KR"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an video file without passing in the source language. - * The source language of the file is ko-KR (Korean) - */ - @Test - public void testAmazonTranscribeUnknownVideo_koKR() { - String source = "src/test/resources/ko-KR_(Annyeonghaseyo).mp4"; - //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 - String expected = "Annyeonghaseyo"; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(source)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file given the source language - * The source language of the file is pt-BR (Portuguese - Brazil) - */ - @Test - public void testAmazonTranscribeAudio_ptBR() { - String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3"; - String expected = "nós estamos na escola."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath), "pt-BR"); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - - /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is pt-BR (Portuguese - Brazil) - */ - @Test - public void testAmazonTranscribeUnknownAudio_ptBR() { - String audioFilePath = "src/test/resources/pt-BR_(We_Are_At_School).mp3"; - String expected = "nós estamos na escola."; - String result; - - if (transcriber.isAvailable()) { - try { - result = transcriber.transcribe(new FileInputStream(audioFilePath)); - assertNotNull(result); - assertEquals("Result: [" + result - + "]: not equal to expected: [" + expected + "]", - expected, result); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - } - } - -} diff --git a/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3 b/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3 deleted file mode 100644 index a718047..0000000 Binary files a/tika-transcribe/src/test/resources/ShortAudioSampleFrench.mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3 b/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3 deleted file mode 100644 index 9d4df04..0000000 Binary files a/tika-transcribe/src/test/resources/de-DE_(We_Are_At_School_x2).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3 b/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3 deleted file mode 100644 index 16f840d..0000000 Binary files a/tika-transcribe/src/test/resources/en-AU_(A_Little_Bottle_Of_Water).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3 b/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3 deleted file mode 100644 index 2c6ae35..0000000 Binary files a/tika-transcribe/src/test/resources/en-GB_(A_Little_Bottle_Of_Water).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3 b/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3 deleted file mode 100644 index 3d69b68..0000000 Binary files a/tika-transcribe/src/test/resources/en-US_(A_Little_Bottle_Of_Water).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/en-US_(Hi).mp4 b/tika-transcribe/src/test/resources/en-US_(Hi).mp4 deleted file mode 100644 index d697b13..0000000 Binary files a/tika-transcribe/src/test/resources/en-US_(Hi).mp4 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3 b/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3 deleted file mode 100644 index 5fa69c3..0000000 Binary files a/tika-transcribe/src/test/resources/it-IT_(We_Are_Having_Class_x2).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3 b/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3 deleted file mode 100644 index 5ddf6e5..0000000 Binary files a/tika-transcribe/src/test/resources/ja-JP_(We_Are_At_School).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4 b/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4 deleted file mode 100644 index d757d42..0000000 Binary files a/tika-transcribe/src/test/resources/ko-KR_(Annyeonghaseyo).mp4 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3 b/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3 deleted file mode 100644 index 444098c..0000000 Binary files a/tika-transcribe/src/test/resources/ko-KR_(We_Are_Having_Class_x2).mp3 and /dev/null differ diff --git a/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3 b/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3 deleted file mode 100644 index 7dfc811..0000000 Binary files a/tika-transcribe/src/test/resources/pt-BR_(We_Are_At_School).mp3 and /dev/null differ
