http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java ---------------------------------------------------------------------- diff --git a/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java new file mode 100644 index 0000000..ec183c6 --- /dev/null +++ b/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.example; + +import java.io.IOException; + +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class LanguageDetectorExampleTest { + LanguageDetectorExample languageDetectorExample; + @Before + public void setUp() { + languageDetectorExample = new LanguageDetectorExample(); + } + + @Test + public void testDetectLanguage() throws IOException { + String text = "This is some text that should be identified as English."; + assertEquals("en", languageDetectorExample.detectLanguage(text)); + } +}
http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-example/src/test/java/org/apache/tika/example/LanguageIdentifierExampleTest.java ---------------------------------------------------------------------- diff --git a/tika-example/src/test/java/org/apache/tika/example/LanguageIdentifierExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/LanguageIdentifierExampleTest.java deleted file mode 100644 index 2a1717e..0000000 --- a/tika-example/src/test/java/org/apache/tika/example/LanguageIdentifierExampleTest.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.example; - -import org.junit.Before; -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -public class LanguageIdentifierExampleTest { - LanguageIdentifierExample languageIdentifierExample; - @Before - public void setUp() { - languageIdentifierExample = new LanguageIdentifierExample(); - } - - @Test - public void testIdentifyLanguage() { - String text = "This is some text that should be identified as English."; - assertEquals("en", languageIdentifierExample.identifyLanguage(text)); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java index 9c689fa..af65d40 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java @@ -4,5 +4,6 @@ public enum LanguageConfidence { HIGH, MEDIUM, - LOW + LOW, + NONE // Special value when no language is detected } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java index 62592db..e97581a 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java @@ -150,14 +150,15 @@ public abstract class LanguageDetector { /** * Detect languages based on previously submitted text (via addText calls). * - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. + * @return list of all possible languages with at least medium confidence, + * sorted by confidence from highest to lowest. There will always + * be at least one result, which might have a confidence of NONE. */ public abstract List<LanguageResult> detectAll(); public LanguageResult detect() { List<LanguageResult> results = detectAll(); - return results.isEmpty() ? null : results.get(0); + return results.get(0); } /** http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java index f317950..631e1ee 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java @@ -16,6 +16,8 @@ */ package org.apache.tika.langdetect; +import java.io.IOException; + import org.apache.tika.sax.WriteOutContentHandler; /** @@ -28,6 +30,10 @@ public class LanguageHandler extends WriteOutContentHandler { private final LanguageWriter writer; + public LanguageHandler() throws IOException { + this(new LanguageWriter(new OptimaizeLangDetector().loadModels())); + } + public LanguageHandler(LanguageWriter writer) { super(writer); @@ -49,4 +55,12 @@ public class LanguageHandler extends WriteOutContentHandler { return writer.getDetector(); } + /** + * Returns the detected language based on text handled thus far. + * + * @return LanguageResult + */ + public LanguageResult getLanguage() { + return writer.getLanguage(); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java index e02b1bd..82a05c8 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java @@ -4,6 +4,9 @@ import java.util.Locale; public class LanguageResult { + // A result that indicates no match. Used when no language was detected. + public static final LanguageResult NULL = new LanguageResult("", LanguageConfidence.NONE, 0.0f); + private String language; private LanguageConfidence confidence; @@ -39,6 +42,10 @@ public class LanguageResult { return confidence == LanguageConfidence.HIGH; } + public boolean isUnknown() { + return confidence == LanguageConfidence.NONE; + } + /** * Return true if the target language matches the detected language. We consider * it a match if, for the precision requested or detected, it matches. This means: http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java index 18026b2..8bd47cc 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java @@ -44,6 +44,15 @@ public class LanguageWriter extends Writer { return detector; } + /** + * Returns the detected language based on text written thus far. + * + * @return LanguageResult + */ + public LanguageResult getLanguage() { + return detector.detect(); + } + @Override public void write(char[] cbuf, int off, int len) { detector.addText(cbuf, off, len); http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java index 4aa93c1..4bd8a21 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java @@ -83,10 +83,10 @@ public class OptimaizeLangDetector extends LanguageDetector { } private com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles) { - // FUTURE decide whether we really want to use the short text algorithm when dealing with mixed languages, - // as that would get really, really slow for big chunks of text. + // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which + // means you can often get 0 probabilities. So we pick a very short length for this limit. LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) - .shortTextAlgorithm(mixedLanguages ? Integer.MAX_VALUE : 100) + .shortTextAlgorithm(30) .withProfiles(languageProfiles); if (languageProbabilities != null) { @@ -149,6 +149,10 @@ public class OptimaizeLangDetector extends LanguageDetector { result.add(new LanguageResult(makeLanguageName(rawResult.getLocale()), confidence, (float)rawResult.getProbability())); } + if (result.isEmpty()) { + result.add(LanguageResult.NULL); + } + return result; } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java b/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java index a95c6a1..3997cdf 100644 --- a/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java +++ b/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java @@ -196,11 +196,11 @@ public class OptimaizeLangDetectorTest extends LanguageDetectorTest { // First verify that we get no result with empty or very short text. LanguageWriter writer = new LanguageWriter(detector); writer.append(""); - assertNull(detector.detect()); + assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence()); writer.reset(); writer.append(" "); - assertNull(detector.detect()); + assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence()); for (String language : getTestLanguages()) { // Short pieces of Japanese are detected as Chinese http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java index f9c8db1..847c101 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java @@ -17,21 +17,22 @@ package org.apache.tika.server.resource; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.IOException; +import java.io.InputStream; + import javax.ws.rs.Consumes; import javax.ws.rs.POST; import javax.ws.rs.PUT; import javax.ws.rs.Path; import javax.ws.rs.Produces; -import java.io.IOException; -import java.io.InputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.tika.language.LanguageIdentifier; -import org.apache.tika.language.LanguageProfile; - -import static java.nio.charset.StandardCharsets.UTF_8; +import org.apache.tika.langdetect.LanguageResult; +import org.apache.tika.langdetect.OptimaizeLangDetector; @Path("/language") public class LanguageResource { @@ -45,13 +46,9 @@ public class LanguageResource { @Consumes("*/*") @Produces("text/plain") public String detect(final InputStream is) throws IOException { - // comme çi comme ça - // this is English! String fileTxt = IOUtils.toString(is, UTF_8); - logger.debug("File: " + fileTxt); - LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile( - fileTxt)); - String detectedLang = lang.getLanguage(); + LanguageResult language = new OptimaizeLangDetector().loadModels().detect(fileTxt); + String detectedLang = language.getLanguage(); logger.info("Detecting language for incoming resource: [" + detectedLang + "]"); return detectedLang; @@ -63,10 +60,8 @@ public class LanguageResource { @Consumes("*/*") @Produces("text/plain") public String detect(final String string) throws IOException { - logger.debug("String: " + string); - LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile( - string)); - String detectedLang = lang.getLanguage(); + LanguageResult language = new OptimaizeLangDetector().loadModels().detect(string); + String detectedLang = language.getLanguage(); logger.info("Detecting language for incoming resource: [" + detectedLang + "]"); return detectedLang; http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java index 5a7ecf4..1fb8385 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java @@ -17,6 +17,9 @@ package org.apache.tika.server.resource; +import java.io.IOException; +import java.io.InputStream; + import javax.ws.rs.Consumes; import javax.ws.rs.POST; import javax.ws.rs.PUT; @@ -28,13 +31,11 @@ import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.Response; import javax.ws.rs.core.UriInfo; -import java.io.IOException; -import java.io.InputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.cxf.jaxrs.ext.multipart.Attachment; -import org.apache.tika.language.ProfilingHandler; +import org.apache.tika.langdetect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -124,7 +125,7 @@ public class MetadataResource { TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(logger, info, metadata); TikaResource.parse(parser, logger, info.getPath(), is, - new ProfilingHandler() { + new LanguageHandler() { public void endDocument() { metadata.set("language", getLanguage().getLanguage()); }}, http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java index 6c44755..57443b5 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java @@ -17,6 +17,8 @@ package org.apache.tika.server.resource; +import java.io.InputStream; + import javax.ws.rs.Consumes; import javax.ws.rs.POST; import javax.ws.rs.PUT; @@ -28,12 +30,11 @@ import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.Response; import javax.ws.rs.core.UriInfo; -import java.io.InputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.cxf.jaxrs.ext.multipart.Attachment; -import org.apache.tika.language.ProfilingHandler; +import org.apache.tika.langdetect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -136,7 +137,7 @@ public class RecursiveMetadataResource { TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(logger, info, metadata); TikaResource.parse(wrapper, logger, info.getPath(), is, - new ProfilingHandler() { + new LanguageHandler() { public void endDocument() { metadata.set("language", getLanguage().getLanguage()); } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java index 0f65f5d..284bb5b 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java @@ -17,15 +17,18 @@ package org.apache.tika.server.resource; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + import javax.ws.rs.Consumes; import javax.ws.rs.POST; import javax.ws.rs.PUT; import javax.ws.rs.Path; import javax.ws.rs.PathParam; import javax.ws.rs.Produces; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; @@ -33,12 +36,11 @@ import org.apache.commons.logging.LogFactory; import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; -import org.apache.tika.language.LanguageIdentifier; -import org.apache.tika.language.LanguageProfile; +import org.apache.tika.langdetect.LanguageConfidence; +import org.apache.tika.langdetect.LanguageResult; +import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.language.translate.Translator; -import static java.nio.charset.StandardCharsets.UTF_8; - @Path("/translate") public class TranslateResource { @@ -77,8 +79,11 @@ public class TranslateResource { @PathParam("translator") String translator, @PathParam("dest") String dLang) throws TikaException, IOException { final String content = IOUtils.toString(is, UTF_8); - LanguageIdentifier language = new LanguageIdentifier( - new LanguageProfile(content)); + LanguageResult language = new OptimaizeLangDetector().loadModels().detect(content); + if (language.isUnknown()) { + throw new TikaException("Unable to detect language to use for translation of text"); + } + String sLang = language.getLanguage(); logger.info("LanguageIdentifier: detected source lang: [" + sLang + "]"); return doTranslate(content, translator, sLang, dLang); http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java new file mode 100644 index 0000000..2ff140e --- /dev/null +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java @@ -0,0 +1,16 @@ +package org.apache.tika.language.translate; + +import java.io.IOException; + +import org.apache.tika.langdetect.LanguageDetector; +import org.apache.tika.langdetect.LanguageResult; +import org.apache.tika.langdetect.OptimaizeLangDetector; + + +public abstract class AbstractTranslator implements Translator { + + protected LanguageResult detectLanguage(String text) throws IOException { + LanguageDetector detector = new OptimaizeLangDetector().loadModels(); + return detector.detect(text); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java index e561f0b..f175681 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java @@ -21,15 +21,14 @@ import java.io.IOException; import java.util.HashMap; import org.apache.tika.exception.TikaException; -import org.apache.tika.language.LanguageIdentifier; -import org.apache.tika.language.LanguageProfile; +import org.apache.tika.langdetect.LanguageResult; import com.fasterxml.jackson.databind.util.LRUMap; /** * CachedTranslator. Saves a map of previous translations in order to prevent repetitive translation requests. */ -public class CachedTranslator implements Translator { +public class CachedTranslator extends AbstractTranslator { private static final int INITIAL_ENTRIES = 100; private static final int MAX_ENTRIES = 1000; private Translator translator; @@ -86,8 +85,7 @@ public class CachedTranslator implements Translator { @Override public String translate(String text, String targetLanguage) throws TikaException, IOException { - LanguageIdentifier language = new LanguageIdentifier( - new LanguageProfile(text)); + LanguageResult language = detectLanguage(text); String sourceLanguage = language.getLanguage(); return translate(text, sourceLanguage, targetLanguage); } @@ -149,10 +147,14 @@ public class CachedTranslator implements Translator { * @return true if the cache contains a translation of the text, false otherwise. */ public boolean contains(String text, String targetLanguage) { - LanguageIdentifier language = new LanguageIdentifier( - new LanguageProfile(text)); - String sourceLanguage = language.getLanguage(); - return contains(text, sourceLanguage, targetLanguage); + try { + LanguageResult language = detectLanguage(text); + String sourceLanguage = language.getLanguage(); + return contains(text, sourceLanguage, targetLanguage); + } catch (IOException e) { + // TODO what to do if we get an error? + return false; + } } /** http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-translate/src/main/java/org/apache/tika/language/translate/ExternalTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/ExternalTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/ExternalTranslator.java index 0b04cf8..725d94c 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/ExternalTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/ExternalTranslator.java @@ -17,17 +17,14 @@ package org.apache.tika.language.translate; -import org.apache.tika.exception.TikaException; -import org.apache.tika.language.LanguageIdentifier; -import org.apache.tika.language.LanguageProfile; - import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; -import java.util.Locale; + +import org.apache.tika.exception.TikaException; /** * Abstract class used to interact with command line/external Translators. @@ -36,7 +33,7 @@ import java.util.Locale; * * @since Tika 1.7 */ -public abstract class ExternalTranslator implements Translator { +public abstract class ExternalTranslator extends AbstractTranslator { /** * Run the given command and return the output written to standard out. @@ -93,9 +90,7 @@ public abstract class ExternalTranslator implements Translator { */ @Override public String translate(String text, String targetLanguage) throws TikaException, IOException { - LanguageIdentifier language = new LanguageIdentifier( - new LanguageProfile(text)); - String sourceLanguage = language.getLanguage(); + String sourceLanguage = detectLanguage(text).getLanguage(); return translate(text, sourceLanguage, targetLanguage); } } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java index ac84879..29c03c6 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java @@ -17,8 +17,7 @@ package org.apache.tika.language.translate; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; +import static java.nio.charset.StandardCharsets.UTF_8; import java.io.BufferedReader; import java.io.IOException; @@ -27,14 +26,14 @@ import java.io.InputStreamReader; import java.util.Properties; import java.util.logging.Logger; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + import org.apache.cxf.jaxrs.client.WebClient; import org.apache.tika.exception.TikaException; -import org.apache.tika.language.LanguageIdentifier; -import org.apache.tika.language.LanguageProfile; -import static java.nio.charset.StandardCharsets.UTF_8; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; /** * An implementation of a REST client to the <a @@ -46,7 +45,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; * * */ -public class GoogleTranslator implements Translator { +public class GoogleTranslator extends AbstractTranslator { private static final String GOOGLE_TRANSLATE_URL_BASE = "https://www.googleapis.com/language/translate/v2"; @@ -104,9 +103,8 @@ public class GoogleTranslator implements Translator { throws TikaException, IOException { if (!this.isAvailable) return text; - LanguageIdentifier language = new LanguageIdentifier( - new LanguageProfile(text)); - String sourceLanguage = language.getLanguage(); + + String sourceLanguage = detectLanguage(text).getLanguage(); return translate(text, sourceLanguage, targetLanguage); } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-translate/src/main/java/org/apache/tika/language/translate/Lingo24Translator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/Lingo24Translator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/Lingo24Translator.java index 56389ba..22589d9 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/Lingo24Translator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/Lingo24Translator.java @@ -17,8 +17,7 @@ package org.apache.tika.language.translate; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; +import static java.nio.charset.StandardCharsets.UTF_8; import java.io.BufferedReader; import java.io.IOException; @@ -26,14 +25,14 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.util.Properties; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + import org.apache.cxf.jaxrs.client.WebClient; import org.apache.tika.exception.TikaException; -import org.apache.tika.language.LanguageIdentifier; -import org.apache.tika.language.LanguageProfile; -import static java.nio.charset.StandardCharsets.UTF_8; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; /** * An implementation of a REST client for the @@ -41,7 +40,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; * You can sign up for an access plan online on the <a href="https://developer.lingo24.com/plans">Lingo24 Developer Portal</a> * and set your Application's User Key in the <code>translator.lingo24.properties</code> file. */ -public class Lingo24Translator implements Translator { +public class Lingo24Translator extends AbstractTranslator { private static final String LINGO24_TRANSLATE_URL_BASE = "https://api.lingo24.com/mt/v1/translate"; @@ -100,9 +99,8 @@ public class Lingo24Translator implements Translator { throws TikaException, IOException { if (!this.isAvailable) return text; - LanguageIdentifier language = new LanguageIdentifier( - new LanguageProfile(text)); - String sourceLanguage = language.getLanguage(); + + String sourceLanguage = detectLanguage(text).getLanguage(); return translate(text, sourceLanguage, targetLanguage); } http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java index 6ff5dca..8a976fe 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java @@ -17,21 +17,18 @@ package org.apache.tika.language.translate; -import org.apache.tika.exception.TikaException; - import java.io.BufferedReader; -import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.Properties; +import org.apache.tika.exception.TikaException; + /** * Translator that uses the Moses decoder for translation. * Users must install the Moses system before using this Translator. @link http://www.statmt.org/moses/.
