Move base lang detect classes to core
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f9113be5 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f9113be5 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f9113be5 Branch: refs/heads/master Commit: f9113be57db9607c0e3710b8560989e0f8f8efef Parents: 3a7a94c Author: Ken Krugler <[email protected]> Authored: Wed Feb 24 15:28:16 2016 -0800 Committer: trevorlewis <[email protected]> Committed: Mon Mar 7 11:44:08 2016 -0800 ---------------------------------------------------------------------- .../main/java/org/apache/tika/cli/TikaCLI.java | 2 +- .../language/detect/LanguageConfidence.java | 9 + .../tika/language/detect/LanguageDetector.java | 224 +++++++++++++++++++ .../tika/language/detect/LanguageHandler.java | 66 ++++++ .../tika/language/detect/LanguageNames.java | 70 ++++++ .../tika/language/detect/LanguageResult.java | 82 +++++++ .../tika/language/detect/LanguageWriter.java | 78 +++++++ .../tika/language/detect/LanguageNamesTest.java | 22 ++ .../java/org/apache/tika/example/Language.java | 8 +- .../tika/example/LanguageDetectingParser.java | 5 +- .../tika/example/LanguageDetectorExample.java | 4 +- .../org/apache/tika/example/MyFirstTika.java | 5 +- .../tika/langdetect/LanguageConfidence.java | 9 - .../tika/langdetect/LanguageDetector.java | 183 --------------- .../apache/tika/langdetect/LanguageHandler.java | 66 ------ .../apache/tika/langdetect/LanguageNames.java | 70 ------ .../apache/tika/langdetect/LanguageResult.java | 82 ------- .../apache/tika/langdetect/LanguageWriter.java | 78 ------- .../tika/langdetect/OptimaizeLangDetector.java | 5 + .../tika/langdetect/LanguageDetectorTest.java | 2 +- .../tika/langdetect/LanguageNamesTest.java | 22 -- .../langdetect/OptimaizeLangDetectorTest.java | 4 + .../tika/server/resource/LanguageResource.java | 2 +- .../tika/server/resource/MetadataResource.java | 2 +- .../resource/RecursiveMetadataResource.java | 2 +- 
.../tika/server/resource/TranslateResource.java | 3 +- .../language/translate/AbstractTranslator.java | 4 +- .../language/translate/CachedTranslator.java | 2 +- 28 files changed, 579 insertions(+), 532 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 3efe0f7..ef8045e 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -83,7 +83,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.fork.ForkParser; import org.apache.tika.gui.TikaGUI; import org.apache.tika.io.TikaInputStream; -import org.apache.tika.langdetect.LanguageHandler; +import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadata; import org.apache.tika.metadata.serialization.JsonMetadataList; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java new file mode 100644 index 0000000..fcd4485 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java @@ -0,0 +1,9 @@ +package org.apache.tika.language.detect; + +public enum LanguageConfidence { + + HIGH, + MEDIUM, + LOW, + NONE // Special value when no language is detected +} 
http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java new file mode 100644 index 0000000..ee6100d --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java @@ -0,0 +1,224 @@ +package org.apache.tika.language.detect; + +import java.io.IOException; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.language.translate.Translator; + +// We should use the IANA registry for primary language names...see +// http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +// There must be a package that uses this dataset to support knowledge of +// the default script, etc. And how to map from <lang>-<country> (e.g. 'zh-CN') +// to <sublang> ('cmn'), or <lang>-<sublang> to <sublang> ('zh-cmn' => 'cmn') +// We'd also want to know the default sublang for a macro language ('zh' => 'zh-cmn') +// There's also mapping 'zh-CN' to 'cmn-Hans' (simplified chinese script) + +// TODO decide how deep to go into supporting extended language tags, see +// http://www.w3.org/International/articles/language-tags/. For example, +// what should you expect from calling hasModel("en-GB") if there's only +// a model for "en"? + +// This is mostly an issue for interpreting language tags in (X)HTML docs, +// and maybe XML if we really care. 
In those cases you could get something +// like "ast" (three letter language code), or even zh-cmn-Hant-SG +// (Chinese, Mandarin, Traditional script, in Singapore) plus additional: +// language-extlang-script-region-variant-extension-privateuse + +// The full spec is at http://www.rfc-editor.org/rfc/bcp/bcp47.txt + +public abstract class LanguageDetector { + + private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader(); + + // True if text is expected to be a mix of languages, and thus higher-resolution + // detection must be done to avoid under-sampling the text. + protected boolean mixedLanguages = false; + + // True if the text is expected to be 'short' (typically less than 100 chars), and + // thus a different algorithm and/or set of profiles should be used. + protected boolean shortText = false; + + public static LanguageDetector getDefaultLanguageDetector() { + List<LanguageDetector> detectors = getLanguageDetectors(); + if (detectors.isEmpty()) { + throw new IllegalStateException("No language detectors available"); + } else { + return detectors.get(0); + } + } + + public static List<LanguageDetector> getLanguageDetectors() { + return getLanguageDetectors(DEFAULT_SERVICE_LOADER); + } + + public static List<LanguageDetector> getLanguageDetectors(ServiceLoader loader) { + List<LanguageDetector> detectors = loader.loadStaticServiceProviders(LanguageDetector.class); + Collections.sort(detectors, new Comparator<LanguageDetector>() { + public int compare(LanguageDetector d1, LanguageDetector d2) { + String n1 = d1.getClass().getName(); + String n2 = d2.getClass().getName(); + boolean tika1 = n1.startsWith("org.apache.tika."); + boolean tika2 = n2.startsWith("org.apache.tika."); + if (tika1 == tika2) { + return n1.compareTo(n2); + } else if (tika1) { + return -1; + } else { + return 1; + } + } + }); + + return detectors; + } + + public boolean isMixedLanguages() { + return mixedLanguages; + } + + public LanguageDetector setMixedLanguages(boolean 
mixedLanguages) { + this.mixedLanguages = mixedLanguages; + return this; + } + + public boolean isShortText() { + return shortText; + } + + public LanguageDetector setShortText(boolean shortText) { + this.shortText = shortText; + return this; + } + + /** + * Load (or re-load) all available language models. This must + * be called after any settings that would impact the models + * being loaded (e.g. mixed language/short text), but + * before any of the document processing routines (below) + * are called. Note that it only needs to be called once. + * + * @return this + */ + public abstract LanguageDetector loadModels() throws IOException; + + /** + * Load (or re-load) the models specified in <languages>. These use the + * ISO 639-1 names, with an optional "-<country code>" for more + * specific specification (e.g. "zh-CN" for Chinese in China). + * + * @param languages list of target languages. + * @return this + */ + public abstract LanguageDetector loadModels(Set<String> languages) throws IOException; + + /** + * Provide information about whether a model exists for a specific + * language. + * + * @param language ISO 639-1 name for language + * @return true if a model for this language exists. + */ + public abstract boolean hasModel(String language); + + /** + * Set the a-priori probabilities for these languages. The provided map uses the language + * as the key, and the probability (0.0 < probability < 1.0) of text being in that language. + * Note that if the probabilities don't sum to 1.0, these values will be normalized. + * + * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown. + * + * Use of these probabilities is detector-specific, and thus might not impact the results at all. + * As such, these should be viewed as a hint. 
+ * + * @param languageProbabilities Map from language to probability + * @return this + */ + public abstract LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException; + + // ============================================================ + // The routines below are called when processing a document + // ============================================================ + + /** + * Reset statistics about the current document being processed + */ + public abstract void reset(); + + /** + * Add statistics about this text for the current document. Note + * that we assume an implicit word break exists before/after + * each of these runs of text. + * + * @param cbuf Character buffer + * @param off Offset into cbuf to first character in the run of text + * @param len Number of characters in the run of text. + */ + public abstract void addText(char[] cbuf, int off, int len); + + /** + * Add <text> to the statistics being accumulated for the current + * document. Note that this is a default implementation for adding + * a string (not optimized) + * + * @param text Characters to add to current statistics. + */ + public void addText(CharSequence text) { + char[] chars = text.toString().toCharArray(); + addText(chars, 0, chars.length); + } + + + /** + * Tell the caller whether more text is required for the current document + * before the language can be reliably detected. + * + * Implementations can override this to do early termination of stats + * collection, which can improve performance with longer documents. + * + * Note that detect() can be called even when this returns false + * + * @return true if we have enough text for reliable detection. + */ + public boolean hasEnoughText() { + return false; + } + + /** + * Detect languages based on previously submitted text (via addText calls). + * + * @return list of all possible languages with at least medium confidence, + * sorted by confidence from highest to lowest. 
There will always + * be at least one result, which might have a confidence of NONE. + */ + public abstract List<LanguageResult> detectAll(); + + public LanguageResult detect() { + List<LanguageResult> results = detectAll(); + return results.get(0); + } + + /** + * Utility wrapper that detects the language of a given chunk of text. + * + * @param text String to add to current statistics. + * @return list of all possible languages with at least medium confidence, + * sorted by confidence from highest to lowest. + */ + public List<LanguageResult> detectAll(String text) { + reset(); + addText(text); + return detectAll(); + } + + public LanguageResult detect(CharSequence text) { + reset(); + addText(text); + return detect(); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java new file mode 100644 index 0000000..673b4db --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.language.detect; + +import java.io.IOException; + +import org.apache.tika.sax.WriteOutContentHandler; + +/** + * SAX content handler that updates a language detector based on all the + * received character content. + * + * @since Apache Tika 0.10 + */ +public class LanguageHandler extends WriteOutContentHandler { + + private final LanguageWriter writer; + + public LanguageHandler() throws IOException { + this(new LanguageWriter(LanguageDetector.getDefaultLanguageDetector().loadModels())); + } + + public LanguageHandler(LanguageWriter writer) { + super(writer); + + this.writer = writer; + } + + public LanguageHandler(LanguageDetector detector) { + this(new LanguageWriter(detector)); + } + + /** + * Returns the language detector used by this content handler. + * Note that the returned detector gets updated whenever new SAX events + * are received by this content handler. + * + * @return language detector + */ + public LanguageDetector getDetector() { + return writer.getDetector(); + } + + /** + * Returns the detected language based on text handled thus far. 
+ * + * @return LanguageResult + */ + public LanguageResult getLanguage() { + return writer.getLanguage(); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java new file mode 100644 index 0000000..d659753 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java @@ -0,0 +1,70 @@ +package org.apache.tika.language.detect; + +import java.util.Locale; + +/** + * Support for language tags (as defined by https://tools.ietf.org/html/bcp47) + * + * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of + * three character language codes. + * + * TODO change to LanguageTag, and use these vs. strings everywhere in the + * language detector API? + * + */ +public class LanguageNames { + + public static String makeName(String language, String script, String region) { + Locale locale = new Locale.Builder().setLanguage(language).setScript(script).setRegion(region).build(); + return locale.toLanguageTag(); + } + + public static String normalizeName(String languageTag) { + Locale locale = Locale.forLanguageTag(languageTag); + return locale.toLanguageTag(); + } + + public static boolean isMacroLanguage(String languageTag) { + Locale locale = Locale.forLanguageTag(languageTag); + // TODO make it so. + return false; + } + + public static boolean hasMacroLanguage(String languageTag) { + Locale locale = Locale.forLanguageTag(languageTag); + // TODO make it so + return false; + } + + /** + * If language is a specific variant of a macro language (e.g. 'nb' for Norwegian Bokmal), + * return the macro language (e.g. 'no' for Norwegian). If it doesn't have a macro language, + * return unchanged. 
+ * + * @param languageTag + * @return + */ + public static String getMacroLanguage(String languageTag) { + // TODO make it so + return languageTag; + } + + public static boolean equals(String languageTagA, String languageTagB) { + Locale localeA = Locale.forLanguageTag(languageTagA); + Locale localeB = Locale.forLanguageTag(languageTagB); + + // TODO Fill in script if missing and something we could derive from lang+region + // e.g. zh-CN => zh-Hans-CN, zh-TW => zh-Hant-TW. + + // TODO Treat missing script == present script, if present script is default (suppressed) for + // the language. So "en-Latn" == "en" + + // TODO probably OK to ignore extensions + + // TODO Do we want/need a fuzzy match for region (and script) + // E.g. are 'en' and 'en-GB' equal? Depends on the direction, e.g. if you want 'en', and + // you get back something more specific (en-GB) then that's OK, but if you explicitly want + // en-GB and you get back en then that might not be OK. + return localeA.equals(localeB); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java new file mode 100644 index 0000000..6952ae9 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java @@ -0,0 +1,82 @@ +package org.apache.tika.language.detect; + +import java.util.Locale; + +public class LanguageResult { + + // A result that indicates no match. Used when no language was detected. + public static final LanguageResult NULL = new LanguageResult("", LanguageConfidence.NONE, 0.0f); + + private String language; + + private LanguageConfidence confidence; + + // rawScore should be a number from 0.0 to 1.0, with higher values implying + // greater confidence. 
+ private float rawScore; + + /** + * + * @param language ISO 639-1 language code (plus optional "-<country code>") + * @param rawScore confidence of detector in the result. + */ + public LanguageResult(String language, LanguageConfidence confidence, float rawScore) { + this.language = language; + this.confidence = confidence; + this.rawScore = rawScore; + } + + public String getLanguage() { + return language; + } + + public float getRawScore() { + return rawScore; + } + + public LanguageConfidence getConfidence() { + return confidence; + } + + public boolean isReasonablyCertain() { + return confidence == LanguageConfidence.HIGH; + } + + public boolean isUnknown() { + return confidence == LanguageConfidence.NONE; + } + + /** + * Return true if the target language matches the detected language. We consider + * it a match if, for the precision requested or detected, it matches. This means: + * + * target | detected | match? + * zh | en | false + * zh | zh | true + * zh | zh-CN | true + * zh-CN | zh | true + * zh-CN | zh-TW | false + * zh-CN | zh-cn | true (case-insensitive) + * + * @param language + * @return + */ + public boolean isLanguage(String language) { + String[] targetLanguage = language.split("\\-"); + String[] resultLanguage = this.language.split("\\-"); + + int minLength = Math.min(targetLanguage.length, resultLanguage.length); + for (int i = 0; i < minLength; i++) { + if (!targetLanguage[i].equalsIgnoreCase(resultLanguage[i])) { + return false; + } + } + + return true; + } + + @Override + public String toString() { + return String.format(Locale.US, "%s: %s (%f)", language, confidence, rawScore); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java new file mode 100644 index 0000000..7630990 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.language.detect; + +import java.io.IOException; +import java.io.Writer; + +/** + * Writer that builds a language profile based on all the written content. + * + * @since Apache Tika 0.10 + */ +public class LanguageWriter extends Writer { + + private final LanguageDetector detector; + + public LanguageWriter(LanguageDetector detector) { + this.detector = detector; + detector.reset(); + } + + /** + * Returns the language detector used by this writer. Note that + * the returned language detector gets updated whenever new characters + * are written. + * + * @return language detector + */ + public LanguageDetector getDetector() { + return detector; + } + + /** + * Returns the detected language based on text written thus far. 
+ * + * @return LanguageResult + */ + public LanguageResult getLanguage() { + return detector.detect(); + } + + @Override + public void write(char[] cbuf, int off, int len) { + detector.addText(cbuf, off, len); + } + + /** + * Ignored. + */ + @Override + public void close() throws IOException { + } + + /** + * Ignored. + */ + @Override + public void flush() { + } + + public void reset() { + detector.reset(); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java new file mode 100644 index 0000000..4951670 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java @@ -0,0 +1,22 @@ +package org.apache.tika.language.detect; + +import static org.junit.Assert.*; + +import org.junit.Test; + +public class LanguageNamesTest { + + @Test + public void test() { + + // macro language + language == language + String languageA = LanguageNames.normalizeName("zh-yue"); + String languageB = LanguageNames.normalizeName("yue"); + assertTrue(LanguageNames.equals(languageA, languageB)); + + // TODO verify that "en-Latn" == "en" + + // TODO verify that "en-GB" == "en"??? 
+ } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/Language.java ---------------------------------------------------------------------- diff --git a/tika-example/src/main/java/org/apache/tika/example/Language.java b/tika-example/src/main/java/org/apache/tika/example/Language.java index 42dc608..ec14a58 100755 --- a/tika-example/src/main/java/org/apache/tika/example/Language.java +++ b/tika-example/src/main/java/org/apache/tika/example/Language.java @@ -19,11 +19,11 @@ package org.apache.tika.example; import java.io.IOException; -import org.apache.tika.langdetect.LanguageDetector; -import org.apache.tika.langdetect.LanguageHandler; -import org.apache.tika.langdetect.LanguageResult; -import org.apache.tika.langdetect.LanguageWriter; import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageHandler; +import org.apache.tika.language.detect.LanguageResult; +import org.apache.tika.language.detect.LanguageWriter; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java ---------------------------------------------------------------------- diff --git a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java index 9b67bd7..0ba8a6c 100755 --- a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java @@ -21,8 +21,8 @@ import java.io.IOException; import java.io.InputStream; import org.apache.tika.exception.TikaException; -import org.apache.tika.langdetect.LanguageHandler; -import 
org.apache.tika.langdetect.LanguageResult; +import org.apache.tika.language.detect.LanguageHandler; +import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.DelegatingParser; @@ -31,7 +31,6 @@ import org.apache.tika.sax.TeeContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -@SuppressWarnings("deprecation") public class LanguageDetectingParser extends DelegatingParser { private static final long serialVersionUID = 4291320409396502774L; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java ---------------------------------------------------------------------- diff --git a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java index d37208f..53e5c7a 100644 --- a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java @@ -19,9 +19,9 @@ package org.apache.tika.example; import java.io.IOException; -import org.apache.tika.langdetect.LanguageDetector; -import org.apache.tika.langdetect.LanguageResult; import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageResult; public class LanguageDetectorExample { http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java ---------------------------------------------------------------------- diff --git a/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java b/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java index 76ec039..fe0c8d9 100755 --- 
a/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java +++ b/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java @@ -26,10 +26,9 @@ import org.apache.commons.io.FileUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; -import org.apache.tika.langdetect.LanguageDetector; -import org.apache.tika.langdetect.LanguageResult; -import org.apache.tika.langdetect.LanguageWriter; import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeTypes; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java deleted file mode 100644 index af65d40..0000000 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java +++ /dev/null @@ -1,9 +0,0 @@ -package org.apache.tika.langdetect; - -public enum LanguageConfidence { - - HIGH, - MEDIUM, - LOW, - NONE // Special value when no language is detected -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java deleted file mode 100644 index e97581a..0000000 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java +++ 
/dev/null @@ -1,183 +0,0 @@ -package org.apache.tika.langdetect; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Set; - -// We should use the IANA registry for primary language names...see -// http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry -// There must be a package that uses this dataset to support knowledge of -// the default script, etc. And how to map from <lang>-<country> (e.g. 'zh-CN') -// to <sublang> ('cmn'), or <lang>-<sublang> to <sublan> ('zh-cmn' => 'cmn') -// We'd also want to know the default sublang for a macro language ('zh' => 'zh-cmn') -// There's also mapping 'zh-CN' to 'cmn-Hans' (simplified chinese script) - -// TODO decide how deep to go into supporting extended language tags, see -// http://www.w3.org/International/articles/language-tags/. For example, -// what should you expect from calling hasModel("en-GB") if there's only -// a model for "en"? - -// This is mostly an issue for interpreting language tags in (X)HTML docs, -// and maybe XML if we really care. In those cases you could get something -// like "ast" (three letter language code), or even zh-cmn-Hant-SG -// (Chinese, Mandarin, Traditional script, in Singapore) plus additional: -// language-extlang-script-region-variant-extension-privateuse - -// The full spec is at http://www.rfc-editor.org/rfc/bcp/bcp47.txt - -public abstract class LanguageDetector { - - // True if text is expected to be a mix of languages, and thus higher-resolution - // detection must be done to avoid under-sampling the text. - protected boolean mixedLanguages = false; - - // True if the text is expected to be 'short' (typically less than 100 chars), and - // thus a different algorithm and/or set of profiles should be used. 
- protected boolean shortText = false; - - public boolean isMixedLanguages() { - return mixedLanguages; - } - - public LanguageDetector setMixedLanguages(boolean mixedLanguages) { - this.mixedLanguages = mixedLanguages; - return this; - } - - public boolean isShortText() { - return shortText; - } - - public LanguageDetector setShortText(boolean shortText) { - this.shortText = shortText; - return this; - } - - /** - * Load (or re-load) all available language models. This must - * be called after any settings that would impact the models - * being loaded (e.g. mixed language/short text), but - * before any of the document processing routines (below) - * are called. Note that it only needs to be called once. - * - * @return this - */ - public abstract LanguageDetector loadModels() throws IOException; - - /** - * Load (or re-load) the models specified in <languages>. These use the - * ISO 639-1 names, with an optional "-<country code>" for more - * specific specification (e.g. "zh-CN" for Chinese in China). - * - * @param languages list of target languages. - * @return this - */ - public abstract LanguageDetector loadModels(Set<String> languages) throws IOException; - - /** - * Provide information about whether a model exists for a specific - * language. - * - * @param language ISO 639-1 name for language - * @return true if a model for this language exists. - */ - public abstract boolean hasModel(String language); - - /** - * Set the a-priori probabilities for these languages. The provided map uses the language - * as the key, and the probability (0.0 > probability < 1.0) of text being in that language. - * Note that if the probabilities don't sum to 1.0, these values will be normalized. - * - * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown. - * - * Use of these probabilities is detector-specific, and thus might not impact the results at all. - * As such, these should be viewed as a hint. 
- * - * @param languageProbabilities Map from language to probability - * @return this - */ - public abstract LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException; - - // ============================================================ - // The routines below are called when processing a document - // ============================================================ - - /** - * Reset statistics about the current document being processed - */ - public abstract void reset(); - - /** - * Add statistics about this text for the current document. Note - * that we assume an implicit word break exists before/after - * each of these runs of text. - * - * @param cbuf Character buffer - * @param off Offset into cbuf to first character in the run of text - * @param len Number of characters in the run of text. - */ - public abstract void addText(char[] cbuf, int off, int len); - - /** - * Add <text> to the statistics being accumulated for the current - * document. Note that this is a default implementation for adding - * a string (not optimized) - * - * @param text Characters to add to current statistics. - */ - public void addText(CharSequence text) { - char[] chars = text.toString().toCharArray(); - addText(chars, 0, chars.length); - } - - - /** - * Tell the caller whether more text is required for the current document - * before the language can be reliably detected. - * - * Implementations can override this to do early termination of stats - * collection, which can improve performance with longer documents. - * - * Note that detect() can be called even when this returns false - * - * @return true if we have enough text for reliable detection. - */ - public boolean hasEnoughText() { - return false; - } - - /** - * Detect languages based on previously submitted text (via addText calls). - * - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. 
There will always - * be at least one result, which might have a confidence of NONE. - */ - public abstract List<LanguageResult> detectAll(); - - public LanguageResult detect() { - List<LanguageResult> results = detectAll(); - return results.get(0); - } - - /** - * Utility wrapper that detects the language of a given chunk of text. - * - * @param text String to add to current statistics. - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. - */ - public List<LanguageResult> detectAll(String text) { - reset(); - addText(text); - return detectAll(); - } - - public LanguageResult detect(CharSequence text) { - reset(); - addText(text); - return detect(); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java deleted file mode 100644 index 631e1ee..0000000 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.langdetect; - -import java.io.IOException; - -import org.apache.tika.sax.WriteOutContentHandler; - -/** - * SAX content handler that updates a language detector based on all the - * received character content. - * - * @since Apache Tika 0.10 - */ -public class LanguageHandler extends WriteOutContentHandler { - - private final LanguageWriter writer; - - public LanguageHandler() throws IOException { - this(new LanguageWriter(new OptimaizeLangDetector().loadModels())); - } - - public LanguageHandler(LanguageWriter writer) { - super(writer); - - this.writer = writer; - } - - public LanguageHandler(LanguageDetector detector) { - this(new LanguageWriter(detector)); - } - - /** - * Returns the language detector used by this content handler. - * Note that the returned detector gets updated whenever new SAX events - * are received by this content handler. - * - * @return language detector - */ - public LanguageDetector getDetector() { - return writer.getDetector(); - } - - /** - * Returns the detected language based on text handled thus far. 
- * - * @return LanguageResult - */ - public LanguageResult getLanguage() { - return writer.getLanguage(); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java deleted file mode 100644 index abed277..0000000 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java +++ /dev/null @@ -1,70 +0,0 @@ -package org.apache.tika.langdetect; - -import java.util.Locale; - -/** - * Support for language tags (as defined by https://tools.ietf.org/html/bcp47) - * - * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of - * three character language codes. - * - * TODO change to LanguageTag, and use these vs. strings everywhere in the - * language detector API? - * - */ -public class LanguageNames { - - public static String makeName(String language, String script, String region) { - Locale locale = new Locale.Builder().setLanguage(language).setScript(script).setRegion(region).build(); - return locale.toLanguageTag(); - } - - public static String normalizeName(String languageTag) { - Locale locale = Locale.forLanguageTag(languageTag); - return locale.toLanguageTag(); - } - - public static boolean isMacroLanguage(String languageTag) { - Locale locale = Locale.forLanguageTag(languageTag); - // TODO make it so. - return false; - } - - public static boolean hasMacroLanguage(String languageTag) { - Locale locale = Locale.forLanguageTag(languageTag); - // TODO make it so - return false; - } - - /** - * If language is a specific variant of a macro language (e.g. 'nb' for Norwegian Bokmal), - * return the macro language (e.g. 'no' for Norwegian). If it doesn't have a macro language, - * return unchanged. 
- * - * @param languageTag - * @return - */ - public static String getMacroLanguage(String languageTag) { - // TODO make it so - return languageTag; - } - - public static boolean equals(String languageTagA, String languageTagB) { - Locale localeA = Locale.forLanguageTag(languageTagA); - Locale localeB = Locale.forLanguageTag(languageTagB); - - // TODO Fill in script if missing and something we could derive from lang+region - // e.g. zh-CN => zh-Hans-CN, zh-TW => zh-Hant-TW. - - // TODO Treat missing script == present script, if present script is default (suppressed) for - // the language. So "en-Latn" == "en" - - // TODO probably OK to ignore extensions - - // TODO Do we want/need a fuzzy match for region (and script) - // E.g. are 'en' and 'en-GB' equal? Depends on the direction, e.g. if you want 'en', and - // you get back something more specific (en-GB) then that's OK, but if you explicitly want - // en-GB and you get back en then that might not be OK. - return localeA.equals(localeB); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java deleted file mode 100644 index 82a05c8..0000000 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java +++ /dev/null @@ -1,82 +0,0 @@ -package org.apache.tika.langdetect; - -import java.util.Locale; - -public class LanguageResult { - - // A result that indicates no match. Used when no language was detected. - public static final LanguageResult NULL = new LanguageResult("", LanguageConfidence.NONE, 0.0f); - - private String language; - - private LanguageConfidence confidence; - - // rawScore should be a number from 0.0 to 1.0, with higher values implying - // greater confidence. 
- private float rawScore; - - /** - * - * @param language ISO 639-1 language code (plus optional "-<country code>") - * @param rawScore confidence of detector in the result. - */ - public LanguageResult(String language, LanguageConfidence confidence, float rawScore) { - this.language = language; - this.confidence = confidence; - this.rawScore = rawScore; - } - - public String getLanguage() { - return language; - } - - public float getRawScore() { - return rawScore; - } - - public LanguageConfidence getConfidence() { - return confidence; - } - - public boolean isReasonablyCertain() { - return confidence == LanguageConfidence.HIGH; - } - - public boolean isUnknown() { - return confidence == LanguageConfidence.NONE; - } - - /** - * Return true if the target language matches the detected language. We consider - * it a match if, for the precision requested or detected, it matches. This means: - * - * target | detected | match? - * zh | en | false - * zh | zh | true - * zh | zh-CN | true - * zh-CN | zh | true - * zh-CN | zh-TW | false - * zh-CN | zh-cn | true (case-insensitive) - * - * @param language - * @return - */ - public boolean isLanguage(String language) { - String[] targetLanguage = language.split("\\-"); - String[] resultLanguage = this.language.split("\\-"); - - int minLength = Math.min(targetLanguage.length, resultLanguage.length); - for (int i = 0; i < minLength; i++) { - if (!targetLanguage[i].equalsIgnoreCase(resultLanguage[i])) { - return false; - } - } - - return true; - } - - @Override - public String toString() { - return String.format(Locale.US, "%s: %s (%f)", language, confidence, rawScore); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java deleted file mode 100644 index 8bd47cc..0000000 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.langdetect; - -import java.io.IOException; -import java.io.Writer; - -/** - * Writer that builds a language profile based on all the written content. - * - * @since Apache Tika 0.10 - */ -public class LanguageWriter extends Writer { - - private final LanguageDetector detector; - - public LanguageWriter(LanguageDetector detector) { - this.detector = detector; - detector.reset(); - } - - /** - * Returns the language detector used by this writer. Note that - * the returned language detector gets updated whenever new characters - * are written. - * - * @return language detector - */ - public LanguageDetector getDetector() { - return detector; - } - - /** - * Returns the detected language based on text written thus far. 
- * - * @return LanguageResult - */ - public LanguageResult getLanguage() { - return detector.detect(); - } - - @Override - public void write(char[] cbuf, int off, int len) { - detector.addText(cbuf, off, len); - } - - /** - * Ignored. - */ - @Override - public void close() throws IOException { - } - - /** - * Ignored. - */ - @Override - public void flush() { - } - - public void reset() { - detector.reset(); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java index 4bd8a21..7461df7 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java @@ -9,6 +9,11 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.tika.language.detect.LanguageConfidence; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageNames; +import org.apache.tika.language.detect.LanguageResult; + import com.optimaize.langdetect.DetectedLanguage; import com.optimaize.langdetect.LanguageDetectorBuilder; import com.optimaize.langdetect.i18n.LdLocale; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java b/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java index a79e83f..7bc2873 100644 --- 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java +++ b/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java @@ -17,7 +17,7 @@ public abstract class LanguageDetectorTest { protected String[] getTestLanguages() throws IOException { List<String> result = new ArrayList<>(); - List<String> lines = IOUtils.readLines(LanguageDetector.class.getResourceAsStream("language-codes.txt")); + List<String> lines = IOUtils.readLines(LanguageDetectorTest.class.getResourceAsStream("language-codes.txt")); for (String line : lines) { line = line.trim(); if (line.isEmpty() || line.startsWith("#")) { http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java b/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java deleted file mode 100644 index 8ff8fd2..0000000 --- a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.apache.tika.langdetect; - -import static org.junit.Assert.*; - -import org.junit.Test; - -public class LanguageNamesTest { - - @Test - public void test() { - - // macro language + language == language - String languageA = LanguageNames.normalizeName("zh-yue"); - String languageB = LanguageNames.normalizeName("yue"); - assertTrue(LanguageNames.equals(languageA, languageB)); - - // TODO verify that "en-Latn" == "en" - - // TODO verify that "en-GB" == "en"??? 
- } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java b/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java index 3997cdf..097cfe1 100644 --- a/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java +++ b/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java @@ -13,6 +13,10 @@ import java.util.Locale; import java.util.Map; import org.apache.tika.io.IOUtils; +import org.apache.tika.language.detect.LanguageConfidence; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageResult; +import org.apache.tika.language.detect.LanguageWriter; import org.junit.Test; public class OptimaizeLangDetectorTest extends LanguageDetectorTest { http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java index 847c101..4eaab91 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java @@ -31,8 +31,8 @@ import javax.ws.rs.Produces; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.tika.langdetect.LanguageResult; import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageResult; @Path("/language") 
public class LanguageResource { http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java index 1fb8385..89d35e8 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java @@ -35,7 +35,7 @@ import javax.ws.rs.core.UriInfo; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.cxf.jaxrs.ext.multipart.Attachment; -import org.apache.tika.langdetect.LanguageHandler; +import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java index 57443b5..aa4e0ab 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java @@ -34,7 +34,7 @@ import javax.ws.rs.core.UriInfo; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.cxf.jaxrs.ext.multipart.Attachment; -import org.apache.tika.langdetect.LanguageHandler; +import org.apache.tika.language.detect.LanguageHandler; import 
org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java ---------------------------------------------------------------------- diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java index 284bb5b..fb8df65 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java @@ -36,9 +36,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; -import org.apache.tika.langdetect.LanguageConfidence; -import org.apache.tika.langdetect.LanguageResult; import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.language.translate.Translator; @Path("/translate") http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java index 2ff140e..d892ab9 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java @@ -2,9 +2,9 @@ package org.apache.tika.language.translate; import java.io.IOException; -import org.apache.tika.langdetect.LanguageDetector; -import 
org.apache.tika.langdetect.LanguageResult; import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageResult; public abstract class AbstractTranslator implements Translator { http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java ---------------------------------------------------------------------- diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java index f175681..f2011be 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java @@ -21,7 +21,7 @@ import java.io.IOException; import java.util.HashMap; import org.apache.tika.exception.TikaException; -import org.apache.tika.langdetect.LanguageResult; +import org.apache.tika.language.detect.LanguageResult; import com.fasterxml.jackson.databind.util.LRUMap;
