[18/34] tika git commit: Move base lang detect classes to core

mattmann Fri, 22 Apr 2016 15:23:50 -0700

Move base lang detect classes to core


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f9113be5
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f9113be5
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f9113be5

Branch: refs/heads/master
Commit: f9113be57db9607c0e3710b8560989e0f8f8efef
Parents: 3a7a94c
Author: Ken Krugler <[email protected]>
Authored: Wed Feb 24 15:28:16 2016 -0800
Committer: trevorlewis <[email protected]>
Committed: Mon Mar 7 11:44:08 2016 -0800

----------------------------------------------------------------------
 .../main/java/org/apache/tika/cli/TikaCLI.java  |   2 +-
 .../language/detect/LanguageConfidence.java     |   9 +
 .../tika/language/detect/LanguageDetector.java  | 224 +++++++++++++++++++
 .../tika/language/detect/LanguageHandler.java   |  66 ++++++
 .../tika/language/detect/LanguageNames.java     |  70 ++++++
 .../tika/language/detect/LanguageResult.java    |  82 +++++++
 .../tika/language/detect/LanguageWriter.java    |  78 +++++++
 .../tika/language/detect/LanguageNamesTest.java |  22 ++
 .../java/org/apache/tika/example/Language.java  |   8 +-
 .../tika/example/LanguageDetectingParser.java   |   5 +-
 .../tika/example/LanguageDetectorExample.java   |   4 +-
 .../org/apache/tika/example/MyFirstTika.java    |   5 +-
 .../tika/langdetect/LanguageConfidence.java     |   9 -
 .../tika/langdetect/LanguageDetector.java       | 183 ---------------
 .../apache/tika/langdetect/LanguageHandler.java |  66 ------
 .../apache/tika/langdetect/LanguageNames.java   |  70 ------
 .../apache/tika/langdetect/LanguageResult.java  |  82 -------
 .../apache/tika/langdetect/LanguageWriter.java  |  78 -------
 .../tika/langdetect/OptimaizeLangDetector.java  |   5 +
 .../tika/langdetect/LanguageDetectorTest.java   |   2 +-
 .../tika/langdetect/LanguageNamesTest.java      |  22 --
 .../langdetect/OptimaizeLangDetectorTest.java   |   4 +
 .../tika/server/resource/LanguageResource.java  |   2 +-
 .../tika/server/resource/MetadataResource.java  |   2 +-
 .../resource/RecursiveMetadataResource.java     |   2 +-
 .../tika/server/resource/TranslateResource.java |   3 +-
 .../language/translate/AbstractTranslator.java  |   4 +-
 .../language/translate/CachedTranslator.java    |   2 +-
 28 files changed, 579 insertions(+), 532 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 3efe0f7..ef8045e 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -83,7 +83,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.gui.TikaGUI;
 import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.langdetect.LanguageHandler;
+import org.apache.tika.language.detect.LanguageHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadata;
 import org.apache.tika.metadata.serialization.JsonMetadataList;

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
new file mode 100644
index 0000000..fcd4485
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
@@ -0,0 +1,9 @@
+package org.apache.tika.language.detect;
+
+public enum LanguageConfidence {
+
+       HIGH,
+       MEDIUM,
+       LOW,
+       NONE            // Special value when no language is detected
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
new file mode 100644
index 0000000..ee6100d
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
@@ -0,0 +1,224 @@
+package org.apache.tika.language.detect;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.language.translate.Translator;
+
+// We should use the IANA registry for primary language names...see
+// http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+// There must be a package that uses this dataset to support knowledge of
+// the default script, etc. And how to map from <lang>-<country> (e.g. 'zh-CN')
+// to <sublang> ('cmn'), or <lang>-<sublang> to <sublang> ('zh-cmn' => 'cmn')
+// We'd also want to know the default sublang for a macro language ('zh' => 'zh-cmn')
+// There's also mapping 'zh-CN' to 'cmn-Hans' (simplified chinese script)
+
+// TODO decide how deep to go into supporting extended language tags, see
+// http://www.w3.org/International/articles/language-tags/. For example,
+// what should you expect from calling hasModel("en-GB") if there's only
+// a model for "en"?
+
+// This is mostly an issue for interpreting language tags in (X)HTML docs,
+// and maybe XML if we really care. In those cases you could get something
+// like "ast" (three letter language code), or even zh-cmn-Hant-SG
+// (Chinese, Mandarin, Traditional script, in Singapore) plus additional:
+// language-extlang-script-region-variant-extension-privateuse
+
+// The full spec is at http://www.rfc-editor.org/rfc/bcp/bcp47.txt
+
+public abstract class LanguageDetector {
+
+       private static final ServiceLoader DEFAULT_SERVICE_LOADER = new 
ServiceLoader();
+
+       // True if text is expected to be a mix of languages, and thus higher-resolution
+       // detection must be done to avoid under-sampling the text.
+       protected boolean mixedLanguages = false;
+       
+       // True if the text is expected to be 'short' (typically less than 100 chars), and
+       // thus a different algorithm and/or set of profiles should be used.
+       protected boolean shortText = false;
+       
+       public static LanguageDetector getDefaultLanguageDetector() {
+               List<LanguageDetector> detectors = getLanguageDetectors();
+               if (detectors.isEmpty()) {
+                       throw new IllegalStateException("No language detectors 
available");
+               } else {
+                       return detectors.get(0);
+               }
+       }
+       
+       public static List<LanguageDetector> getLanguageDetectors() {
+               return getLanguageDetectors(DEFAULT_SERVICE_LOADER);
+       }
+       
+       public static List<LanguageDetector> getLanguageDetectors(ServiceLoader 
loader) {
+        List<LanguageDetector> detectors = 
loader.loadStaticServiceProviders(LanguageDetector.class);
+        Collections.sort(detectors, new Comparator<LanguageDetector>() {
+            public int compare(LanguageDetector d1, LanguageDetector d2) {
+                String n1 = d1.getClass().getName();
+                String n2 = d2.getClass().getName();
+                boolean tika1 = n1.startsWith("org.apache.tika.");
+                boolean tika2 = n2.startsWith("org.apache.tika.");
+                if (tika1 == tika2) {
+                    return n1.compareTo(n2);
+                } else if (tika1) {
+                    return -1;
+                } else {
+                    return 1;
+                }
+            }
+        });
+        
+        return detectors;
+       }
+       
+       public boolean isMixedLanguages() {
+               return mixedLanguages;
+       }
+       
+       public LanguageDetector setMixedLanguages(boolean mixedLanguages) {
+               this.mixedLanguages = mixedLanguages;
+               return this;
+       }
+       
+       public boolean isShortText() {
+               return shortText;
+       }
+       
+       public LanguageDetector setShortText(boolean shortText) {
+               this.shortText = shortText;
+               return this;
+       }
+       
+       /**
+        * Load (or re-load) all available language models. This must
+        * be called after any settings that would impact the models
+        * being loaded (e.g. mixed language/short text), but
+        * before any of the document processing routines (below)
+        * are called. Note that it only needs to be called once.
+        * 
+        * @return this
+        */
+       public abstract LanguageDetector loadModels() throws IOException;
+       
+       /**
+        * Load (or re-load) the models specified in <languages>. These use the
+        * ISO 639-1 names, with an optional "-<country code>" for more
+        * specific specification (e.g. "zh-CN" for Chinese in China).
+        * 
+        * @param languages list of target languages.
+        * @return this
+        */
+       public abstract LanguageDetector loadModels(Set<String> languages) 
throws IOException;
+       
+       /**
+        * Provide information about whether a model exists for a specific
+        * language.
+        * 
+        * @param language ISO 639-1 name for language
+        * @return true if a model for this language exists.
+        */
+       public abstract boolean hasModel(String language);
+       
+       /**
+        * Set the a-priori probabilities for these languages. The provided map uses the language
+        * as the key, and the probability (0.0 < probability < 1.0) of text being in that language
+        * as the value.
+        * Note that if the probabilities don't sum to 1.0, these values will be normalized.
+        * 
+        * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown.
+        * 
+        * Use of these probabilities is detector-specific, and thus might not impact the results at all.
+        * As such, these should be viewed as a hint.
+        * 
+        * @param languageProbabilities Map from language to probability
+        * @return this
+        */
+       public abstract LanguageDetector setPriors(Map<String, Float> 
languageProbabilities) throws IOException;
+       
+       // ============================================================
+       // The routines below are called when processing a document
+       // ============================================================
+
+       /**
+        * Reset statistics about the current document being processed
+        */
+       public abstract void reset();
+       
+       /**
+        * Add statistics about this text for the current document. Note
+        * that we assume an implicit word break exists before/after
+        * each of these runs of text.
+        * 
+        * @param cbuf Character buffer
+        * @param off Offset into cbuf to first character in the run of text
+        * @param len Number of characters in the run of text.
+        */
+       public abstract void addText(char[] cbuf, int off, int len);
+       
+       /**
+        * Add <text> to the statistics being accumulated for the current
+        * document. Note that this is a default implementation for adding
+        * a string (not optimized)
+        * 
+        * @param text Characters to add to current statistics.
+        */
+       public void addText(CharSequence text) {
+               char[] chars = text.toString().toCharArray();
+               addText(chars, 0, chars.length);
+       }
+
+       
+       /**
+        * Tell the caller whether enough text has been added for the current document
+        * for the language to be reliably detected.
+        * 
+        * Implementations can override this to do early termination of stats
+        * collection, which can improve performance with longer documents.
+        * 
+        * Note that detect() can be called even when this returns false
+        * 
+        * @return true if we have enough text for reliable detection.
+        */
+       public boolean hasEnoughText() {
+               return false;
+       }
+       
+       /**
+        * Detect languages based on previously submitted text (via addText calls).
+        * 
+        * @return      list of all possible languages with at least medium confidence,
+        *                      sorted by confidence from highest to lowest. There will always
+        *                      be at least one result, which might have a confidence of NONE.
+        */
+       public abstract List<LanguageResult> detectAll();
+       
+       public LanguageResult detect() {
+               List<LanguageResult> results = detectAll();
+               return results.get(0);
+       }
+
+       /**
+        * Utility wrapper that detects the language of a given chunk of text.
+        * 
+        * @param text String to add to current statistics.
+        * @return list of all possible languages with at least medium confidence,
+        *                      sorted by confidence from highest to lowest.
+        */
+       public List<LanguageResult> detectAll(String text) {
+               reset();
+               addText(text);
+               return detectAll();
+       }
+       
+       public LanguageResult detect(CharSequence text) {
+               reset();
+               addText(text);
+               return detect();
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java
new file mode 100644
index 0000000..673b4db
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.detect;
+
+import java.io.IOException;
+
+import org.apache.tika.sax.WriteOutContentHandler;
+
+/**
+ * SAX content handler that updates a language detector based on all the
+ * received character content.
+ *
+ * @since Apache Tika 0.10
+ */
+public class LanguageHandler extends WriteOutContentHandler {
+
+    private final LanguageWriter writer;
+
+    public LanguageHandler() throws IOException {
+       this(new 
LanguageWriter(LanguageDetector.getDefaultLanguageDetector().loadModels()));
+    }
+    
+    public LanguageHandler(LanguageWriter writer) {
+        super(writer);
+        
+        this.writer = writer;
+    }
+
+    public LanguageHandler(LanguageDetector detector) {
+        this(new LanguageWriter(detector));
+    }
+
+    /**
+     * Returns the language detector used by this content handler.
+     * Note that the returned detector gets updated whenever new SAX events
+     * are received by this content handler.
+     *
+     * @return language detector
+     */
+    public LanguageDetector getDetector() {
+        return writer.getDetector();
+    }
+
+    /**
+     * Returns the detected language based on text handled thus far.
+     * 
+     * @return LanguageResult
+     */
+    public LanguageResult getLanguage() {
+       return writer.getLanguage();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java
new file mode 100644
index 0000000..d659753
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java
@@ -0,0 +1,70 @@
+package org.apache.tika.language.detect;
+
+import java.util.Locale;
+
+/**
+ * Support for language tags (as defined by https://tools.ietf.org/html/bcp47)
+ * 
+ * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of
+ * three character language codes.
+ * 
+ * TODO change to LanguageTag, and use these vs. strings everywhere in the
+ * language detector API?
+ *
+ */
+public class LanguageNames {
+
+       public static String makeName(String language, String script, String 
region) {
+               Locale locale = new 
Locale.Builder().setLanguage(language).setScript(script).setRegion(region).build();
+               return locale.toLanguageTag();
+       }
+
+       public static String normalizeName(String languageTag) {
+               Locale locale = Locale.forLanguageTag(languageTag);
+               return locale.toLanguageTag();
+       }
+       
+       public static boolean isMacroLanguage(String languageTag) {
+               Locale locale = Locale.forLanguageTag(languageTag);
+               // TODO make it so.
+               return false;
+       }
+       
+       public static boolean hasMacroLanguage(String languageTag) {
+               Locale locale = Locale.forLanguageTag(languageTag);
+               // TODO make it so
+               return false;
+       }
+       
+       /**
+        * If language is a specific variant of a macro language (e.g. 'nb' for Norwegian Bokmal),
+        * return the macro language (e.g. 'no' for Norwegian). If it doesn't have a macro language,
+        * return unchanged.
+        * 
+        * @param languageTag
+        * @return
+        */
+       public static String getMacroLanguage(String languageTag) {
+               // TODO make it so
+               return languageTag;
+       }
+       
+       public static boolean equals(String languageTagA, String languageTagB) {
+               Locale localeA = Locale.forLanguageTag(languageTagA);
+               Locale localeB = Locale.forLanguageTag(languageTagB);
+               
+               // TODO Fill in script if missing and something we could derive from lang+region
+               // e.g. zh-CN => zh-Hans-CN, zh-TW => zh-Hant-TW.
+               
+               // TODO Treat missing script == present script, if present script is default (suppressed) for
+               // the language. So "en-Latn" == "en"
+               
+               // TODO probably OK to ignore extensions
+               
+               // TODO Do we want/need a fuzzy match for region (and script)
+               // E.g. are 'en' and 'en-GB' equal? Depends on the direction, e.g. if you want 'en', and
+               // you get back something more specific (en-GB) then that's OK, but if you explicitly want
+               // en-GB and you get back en then that might not be OK.
+               return localeA.equals(localeB);
+       }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
new file mode 100644
index 0000000..6952ae9
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -0,0 +1,82 @@
+package org.apache.tika.language.detect;
+
+import java.util.Locale;
+
+public class LanguageResult {
+
+       // A result that indicates no match. Used when no language was detected.
+       public static final LanguageResult NULL = new LanguageResult("", 
LanguageConfidence.NONE, 0.0f);
+       
+       private String language;
+       
+       private LanguageConfidence confidence;
+       
+       // rawScore should be a number from 0.0 to 1.0, with higher values implying
+       // greater confidence.
+       private float rawScore;
+       
+       /**
+        * 
+        * @param language ISO 639-1 language code (plus optional "-<country code>")
+        * @param rawScore confidence of detector in the result.
+        */
+       public LanguageResult(String language, LanguageConfidence confidence, 
float rawScore) {
+               this.language = language;
+               this.confidence = confidence;
+               this.rawScore = rawScore;
+       }
+
+       public String getLanguage() {
+               return language;
+       }
+
+       public float getRawScore() {
+               return rawScore;
+       }
+       
+       public LanguageConfidence getConfidence() {
+               return confidence;
+       }
+       
+       public boolean isReasonablyCertain() {
+               return confidence == LanguageConfidence.HIGH;
+       }
+       
+       public boolean isUnknown() {
+               return confidence == LanguageConfidence.NONE;
+       }
+       
+       /**
+        * Return true if the target language matches the detected language. We consider
+        * it a match if, for the precision requested or detected, it matches. This means:
+        * 
+        * target       |       detected        | match?
+        * zh           |       en                      | false
+        * zh           |       zh                      | true
+        * zh           |       zh-CN           | true
+        * zh-CN        |       zh                      | true
+        * zh-CN        |       zh-TW           | false
+        * zh-CN        |       zh-cn           | true (case-insensitive)
+        * 
+        * @param language
+        * @return
+        */
+       public boolean isLanguage(String language) {
+               String[] targetLanguage = language.split("\\-");
+               String[] resultLanguage = this.language.split("\\-");
+               
+               int minLength = Math.min(targetLanguage.length, 
resultLanguage.length);
+               for (int i = 0; i < minLength; i++) {
+                       if 
(!targetLanguage[i].equalsIgnoreCase(resultLanguage[i])) {
+                               return false;
+                       }
+               }
+               
+               return true;
+       }
+       
+       @Override
+       public String toString() {
+               return String.format(Locale.US, "%s: %s (%f)", language, 
confidence, rawScore);
+       }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java
new file mode 100644
index 0000000..7630990
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.detect;
+
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * Writer that builds a language profile based on all the written content.
+ *
+ * @since Apache Tika 0.10
+ */
+public class LanguageWriter extends Writer {
+
+    private final LanguageDetector detector;
+
+    public LanguageWriter(LanguageDetector detector) {
+        this.detector = detector;
+        detector.reset();
+    }
+
+    /**
+     * Returns the language detector used by this writer. Note that
+     * the returned language detector gets updated whenever new characters
+     * are written.
+     *
+     * @return language detector
+     */
+    public LanguageDetector getDetector() {
+        return detector;
+    }
+
+    /**
+     * Returns the detected language based on text written thus far.
+     * 
+     * @return LanguageResult
+     */
+    public LanguageResult getLanguage() {
+       return detector.detect();
+    }
+    
+    @Override
+    public void write(char[] cbuf, int off, int len) {
+       detector.addText(cbuf, off, len);
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void close() throws IOException {
+    }
+
+    /**
+     * Ignored.
+     */
+    @Override
+    public void flush() {
+    }
+
+    public void reset() {
+       detector.reset();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
 
b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
new file mode 100644
index 0000000..4951670
--- /dev/null
+++ 
b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
@@ -0,0 +1,22 @@
+package org.apache.tika.language.detect;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+public class LanguageNamesTest {
+
+       @Test
+       public void test() {
+               
+               // macro language + language == language
+               String languageA = LanguageNames.normalizeName("zh-yue");
+               String languageB = LanguageNames.normalizeName("yue");
+               assertTrue(LanguageNames.equals(languageA, languageB));
+               
+               // TODO verify that "en-Latn" == "en"
+               
+               // TODO verify that "en-GB" == "en"???
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/Language.java
----------------------------------------------------------------------
diff --git a/tika-example/src/main/java/org/apache/tika/example/Language.java 
b/tika-example/src/main/java/org/apache/tika/example/Language.java
index 42dc608..ec14a58 100755
--- a/tika-example/src/main/java/org/apache/tika/example/Language.java
+++ b/tika-example/src/main/java/org/apache/tika/example/Language.java
@@ -19,11 +19,11 @@ package org.apache.tika.example;
 
 import java.io.IOException;
 
-import org.apache.tika.langdetect.LanguageDetector;
-import org.apache.tika.langdetect.LanguageHandler;
-import org.apache.tika.langdetect.LanguageResult;
-import org.apache.tika.langdetect.LanguageWriter;
 import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageHandler;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.language.detect.LanguageWriter;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java
----------------------------------------------------------------------
diff --git 
a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java
 
b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java
index 9b67bd7..0ba8a6c 100755
--- 
a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java
+++ 
b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java
@@ -21,8 +21,8 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.langdetect.LanguageHandler;
-import org.apache.tika.langdetect.LanguageResult;
+import org.apache.tika.language.detect.LanguageHandler;
+import org.apache.tika.language.detect.LanguageResult;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.DelegatingParser;
@@ -31,7 +31,6 @@ import org.apache.tika.sax.TeeContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-@SuppressWarnings("deprecation")
 public class LanguageDetectingParser extends DelegatingParser {
     private static final long serialVersionUID = 4291320409396502774L;
 

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java
----------------------------------------------------------------------
diff --git 
a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java
 
b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java
index d37208f..53e5c7a 100644
--- 
a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java
+++ 
b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java
@@ -19,9 +19,9 @@ package org.apache.tika.example;
 
 import java.io.IOException;
 
-import org.apache.tika.langdetect.LanguageDetector;
-import org.apache.tika.langdetect.LanguageResult;
 import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
 
 public class LanguageDetectorExample {
        

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
----------------------------------------------------------------------
diff --git 
a/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java 
b/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
index 76ec039..fe0c8d9 100755
--- a/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
+++ b/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
@@ -26,10 +26,9 @@ import org.apache.commons.io.FileUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.langdetect.LanguageDetector;
-import org.apache.tika.langdetect.LanguageResult;
-import org.apache.tika.langdetect.LanguageWriter;
 import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeTypes;

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java
 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java
deleted file mode 100644
index af65d40..0000000
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageConfidence.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package org.apache.tika.langdetect;
-
-public enum LanguageConfidence {
-
-       HIGH,
-       MEDIUM,
-       LOW,
-       NONE            // Special value when no language is detected
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java
 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java
deleted file mode 100644
index e97581a..0000000
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageDetector.java
+++ /dev/null
@@ -1,183 +0,0 @@
-package org.apache.tika.langdetect;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-// We should use the IANA registry for primary language names...see
-// http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
-// There must be a package that uses this dataset to support knowledge of
-// the default script, etc. And how to map from <lang>-<country> (e.g. 'zh-CN')
-// to <sublang> ('cmn'), or <lang>-<sublang> to <sublan> ('zh-cmn' => 'cmn')
-// We'd also want to know the default sublang for a macro language ('zh' => 'zh-cmn')
-// There's also mapping 'zh-CN' to 'cmn-Hans' (simplified chinese script)
-
-// TODO decide how deep to go into supporting extended language tags, see
-// http://www.w3.org/International/articles/language-tags/. For example,
-// what should you expect from calling hasModel("en-GB") if there's only
-// a model for "en"?
-
-// This is mostly an issue for interpreting language tags in (X)HTML docs,
-// and maybe XML if we really care. In those cases you could get something
-// like "ast" (three letter language code), or even zh-cmn-Hant-SG
-// (Chinese, Mandarin, Traditional script, in Singapore) plus additional:
-// language-extlang-script-region-variant-extension-privateuse
-
-// The full spec is at http://www.rfc-editor.org/rfc/bcp/bcp47.txt
-
-public abstract class LanguageDetector {
-
-       // True if text is expected to be a mix of languages, and thus higher-resolution
-       // detection must be done to avoid under-sampling the text.
-       protected boolean mixedLanguages = false;
-       
-       // True if the text is expected to be 'short' (typically less than 100 chars), and
-       // thus a different algorithm and/or set of profiles should be used.
-       protected boolean shortText = false;
-       
-       public boolean isMixedLanguages() {
-               return mixedLanguages;
-       }
-       
-       public LanguageDetector setMixedLanguages(boolean mixedLanguages) {
-               this.mixedLanguages = mixedLanguages;
-               return this;
-       }
-       
-       public boolean isShortText() {
-               return shortText;
-       }
-       
-       public LanguageDetector setShortText(boolean shortText) {
-               this.shortText = shortText;
-               return this;
-       }
-       
-       /**
-        * Load (or re-load) all available language models. This must
-        * be called after any settings that would impact the models
-        * being loaded (e.g. mixed language/short text), but
-        * before any of the document processing routines (below)
-        * are called. Note that it only needs to be called once.
-        * 
-        * @return this
-        */
-       public abstract LanguageDetector loadModels() throws IOException;
-       
-       /**
-        * Load (or re-load) the models specified in <languages>. These use the
-        * ISO 639-1 names, with an optional "-<country code>" for more
-        * specific specification (e.g. "zh-CN" for Chinese in China).
-        * 
-        * @param languages list of target languages.
-        * @return this
-        */
-       public abstract LanguageDetector loadModels(Set<String> languages) throws IOException;
-       
-       /**
-        * Provide information about whether a model exists for a specific
-        * language.
-        * 
-        * @param language ISO 639-1 name for language
-        * @return true if a model for this language exists.
-        */
-       public abstract boolean hasModel(String language);
-       
-       /**
-        * Set the a-priori probabilities for these languages. The provided map uses the language
-        * as the key, and the probability (0.0 > probability < 1.0) of text being in that language.
-        * Note that if the probabilities don't sum to 1.0, these values will be normalized.
-        * 
-        * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown.
-        * 
-        * Use of these probabilities is detector-specific, and thus might not impact the results at all.
-        * As such, these should be viewed as a hint.
-        * 
-        * @param languageProbabilities Map from language to probability
-        * @return this
-        */
-       public abstract LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException;
-       
-       // ============================================================
-       // The routines below are called when processing a document
-       // ============================================================
-
-       /**
-        * Reset statistics about the current document being processed
-        */
-       public abstract void reset();
-       
-       /**
-        * Add statistics about this text for the current document. Note
-        * that we assume an implicit word break exists before/after
-        * each of these runs of text.
-        * 
-        * @param cbuf Character buffer
-        * @param off Offset into cbuf to first character in the run of text
-        * @param len Number of characters in the run of text.
-        */
-       public abstract void addText(char[] cbuf, int off, int len);
-       
-       /**
-        * Add <text> to the statistics being accumulated for the current
-        * document. Note that this is a default implementation for adding
-        * a string (not optimized)
-        * 
-        * @param text Characters to add to current statistics.
-        */
-       public void addText(CharSequence text) {
-               char[] chars = text.toString().toCharArray();
-               addText(chars, 0, chars.length);
-       }
-
-       
-       /**
-        * Tell the caller whether more text is required for the current document
-        * before the language can be reliably detected.
-        * 
-        * Implementations can override this to do early termination of stats
-        * collection, which can improve performance with longer documents.
-        * 
-        * Note that detect() can be called even when this returns false
-        * 
-        * @return true if we have enough text for reliable detection.
-        */
-       public boolean hasEnoughText() {
-               return false;
-       }
-       
-       /**
-        * Detect languages based on previously submitted text (via addText calls).
-        * 
-        * @return      list of all possible languages with at least medium confidence,
-        *                      sorted by confidence from highest to lowest. There will always
-        *                      be at least one result, which might have a confidence of NONE.
-        */
-       public abstract List<LanguageResult> detectAll();
-       
-       public LanguageResult detect() {
-               List<LanguageResult> results = detectAll();
-               return results.get(0);
-       }
-
-       /**
-        * Utility wrapper that detects the language of a given chunk of text.
-        * 
-        * @param text String to add to current statistics.
-        * @return list of all possible languages with at least medium confidence,
-        *                      sorted by confidence from highest to lowest.
-        */
-       public List<LanguageResult> detectAll(String text) {
-               reset();
-               addText(text);
-               return detectAll();
-       }
-       
-       public LanguageResult detect(CharSequence text) {
-               reset();
-               addText(text);
-               return detect();
-       }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java
deleted file mode 100644
index 631e1ee..0000000
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageHandler.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.langdetect;
-
-import java.io.IOException;
-
-import org.apache.tika.sax.WriteOutContentHandler;
-
-/**
- * SAX content handler that updates a language detector based on all the
- * received character content.
- *
- * @since Apache Tika 0.10
- */
-public class LanguageHandler extends WriteOutContentHandler {
-
-    private final LanguageWriter writer;
-
-    public LanguageHandler() throws IOException {
-       this(new LanguageWriter(new OptimaizeLangDetector().loadModels()));
-    }
-    
-    public LanguageHandler(LanguageWriter writer) {
-        super(writer);
-        
-        this.writer = writer;
-    }
-
-    public LanguageHandler(LanguageDetector detector) {
-        this(new LanguageWriter(detector));
-    }
-
-    /**
-     * Returns the language detector used by this content handler.
-     * Note that the returned detector gets updated whenever new SAX events
-     * are received by this content handler.
-     *
-     * @return language detector
-     */
-    public LanguageDetector getDetector() {
-        return writer.getDetector();
-    }
-
-    /**
-     * Returns the detected language based on text handled thus far.
-     * 
-     * @return LanguageResult
-     */
-    public LanguageResult getLanguage() {
-       return writer.getLanguage();
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java
deleted file mode 100644
index abed277..0000000
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageNames.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package org.apache.tika.langdetect;
-
-import java.util.Locale;
-
-/**
- * Support for language tags (as defined by https://tools.ietf.org/html/bcp47)
- * 
- * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of
- * three character language codes.
- * 
- * TODO change to LanguageTag, and use these vs. strings everywhere in the
- * language detector API?
- *
- */
-public class LanguageNames {
-
-       public static String makeName(String language, String script, String region) {
-               Locale locale = new Locale.Builder().setLanguage(language).setScript(script).setRegion(region).build();
-               return locale.toLanguageTag();
-       }
-
-       public static String normalizeName(String languageTag) {
-               Locale locale = Locale.forLanguageTag(languageTag);
-               return locale.toLanguageTag();
-       }
-       
-       public static boolean isMacroLanguage(String languageTag) {
-               Locale locale = Locale.forLanguageTag(languageTag);
-               // TODO make it so.
-               return false;
-       }
-       
-       public static boolean hasMacroLanguage(String languageTag) {
-               Locale locale = Locale.forLanguageTag(languageTag);
-               // TODO make it so
-               return false;
-       }
-       
-       /**
-        * If language is a specific variant of a macro language (e.g. 'nb' for Norwegian Bokmal),
-        * return the macro language (e.g. 'no' for Norwegian). If it doesn't have a macro language,
-        * return unchanged.
-        * 
-        * @param languageTag
-        * @return
-        */
-       public static String getMacroLanguage(String languageTag) {
-               // TODO make it so
-               return languageTag;
-       }
-       
-       public static boolean equals(String languageTagA, String languageTagB) {
-               Locale localeA = Locale.forLanguageTag(languageTagA);
-               Locale localeB = Locale.forLanguageTag(languageTagB);
-               
-               // TODO Fill in script if missing and something we could derive from lang+region
-               // e.g. zh-CN => zh-Hans-CN, zh-TW => zh-Hant-TW.
-               
-               // TODO Treat missing script == present script, if present script is default (suppressed) for
-               // the language. So "en-Latn" == "en"
-               
-               // TODO probably OK to ignore extensions
-               
-               // TODO Do we want/need a fuzzy match for region (and script)
-               // E.g. are 'en' and 'en-GB' equal? Depends on the direction, e.g. if you want 'en', and
-               // you get back something more specific (en-GB) then that's OK, but if you explicitly want
-               // en-GB and you get back en then that might not be OK.
-               return localeA.equals(localeB);
-       }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java
deleted file mode 100644
index 82a05c8..0000000
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageResult.java
+++ /dev/null
@@ -1,82 +0,0 @@
-package org.apache.tika.langdetect;
-
-import java.util.Locale;
-
-public class LanguageResult {
-
-       // A result that indicates no match. Used when no language was detected.
-       public static final LanguageResult NULL = new LanguageResult("", LanguageConfidence.NONE, 0.0f);
-       
-       private String language;
-       
-       private LanguageConfidence confidence;
-       
-       // rawScore should be a number from 0.0 to 1.0, with higher values implying
-       // greater confidence.
-       private float rawScore;
-       
-       /**
-        * 
-        * @param language ISO 639-1 language code (plus optional "-<country code>")
-        * @param rawScore confidence of detector in the result.
-        */
-       public LanguageResult(String language, LanguageConfidence confidence, float rawScore) {
-               this.language = language;
-               this.confidence = confidence;
-               this.rawScore = rawScore;
-       }
-
-       public String getLanguage() {
-               return language;
-       }
-
-       public float getRawScore() {
-               return rawScore;
-       }
-       
-       public LanguageConfidence getConfidence() {
-               return confidence;
-       }
-       
-       public boolean isReasonablyCertain() {
-               return confidence == LanguageConfidence.HIGH;
-       }
-       
-       public boolean isUnknown() {
-               return confidence == LanguageConfidence.NONE;
-       }
-       
-       /**
-        * Return true if the target language matches the detected language. We consider
-        * it a match if, for the precision requested or detected, it matches. This means:
-        * 
-        * target       |       detected        | match?
-        * zh           |       en                      | false
-        * zh           |       zh                      | true
-        * zh           |       zh-CN           | true
-        * zh-CN        |       zh                      | true
-        * zh-CN        |       zh-TW           | false
-        * zh-CN        |       zh-cn           | true (case-insensitive)
-        * 
-        * @param language
-        * @return
-        */
-       public boolean isLanguage(String language) {
-               String[] targetLanguage = language.split("\\-");
-               String[] resultLanguage = this.language.split("\\-");
-               
-               int minLength = Math.min(targetLanguage.length, resultLanguage.length);
-               for (int i = 0; i < minLength; i++) {
-                       if (!targetLanguage[i].equalsIgnoreCase(resultLanguage[i])) {
-                               return false;
-                       }
-               }
-               
-               return true;
-       }
-       
-       @Override
-       public String toString() {
-               return String.format(Locale.US, "%s: %s (%f)", language, confidence, rawScore);
-       }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java
deleted file mode 100644
index 8bd47cc..0000000
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/LanguageWriter.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.langdetect;
-
-import java.io.IOException;
-import java.io.Writer;
-
-/**
- * Writer that builds a language profile based on all the written content.
- *
- * @since Apache Tika 0.10
- */
-public class LanguageWriter extends Writer {
-
-    private final LanguageDetector detector;
-
-    public LanguageWriter(LanguageDetector detector) {
-        this.detector = detector;
-        detector.reset();
-    }
-
-    /**
-     * Returns the language detector used by this writer. Note that
-     * the returned language detector gets updated whenever new characters
-     * are written.
-     *
-     * @return language detector
-     */
-    public LanguageDetector getDetector() {
-        return detector;
-    }
-
-    /**
-     * Returns the detected language based on text written thus far.
-     * 
-     * @return LanguageResult
-     */
-    public LanguageResult getLanguage() {
-       return detector.detect();
-    }
-    
-    @Override
-    public void write(char[] cbuf, int off, int len) {
-       detector.addText(cbuf, off, len);
-    }
-
-    /**
-     * Ignored.
-     */
-    @Override
-    public void close() throws IOException {
-    }
-
-    /**
-     * Ignored.
-     */
-    @Override
-    public void flush() {
-    }
-
-    public void reset() {
-       detector.reset();
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java
 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java
index 4bd8a21..7461df7 100644
--- 
a/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java
+++ 
b/tika-langdetect/src/main/java/org/apache/tika/langdetect/OptimaizeLangDetector.java
@@ -9,6 +9,11 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.tika.language.detect.LanguageConfidence;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageNames;
+import org.apache.tika.language.detect.LanguageResult;
+
 import com.optimaize.langdetect.DetectedLanguage;
 import com.optimaize.langdetect.LanguageDetectorBuilder;
 import com.optimaize.langdetect.i18n.LdLocale;

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java
 
b/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java
index a79e83f..7bc2873 100644
--- 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java
+++ 
b/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageDetectorTest.java
@@ -17,7 +17,7 @@ public abstract class LanguageDetectorTest {
     protected String[] getTestLanguages() throws IOException {
        List<String> result = new ArrayList<>();
        
-       List<String> lines = IOUtils.readLines(LanguageDetector.class.getResourceAsStream("language-codes.txt"));
+       List<String> lines = IOUtils.readLines(LanguageDetectorTest.class.getResourceAsStream("language-codes.txt"));
        for (String line : lines) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("#")) {

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java
 
b/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java
deleted file mode 100644
index 8ff8fd2..0000000
--- 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/LanguageNamesTest.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package org.apache.tika.langdetect;
-
-import static org.junit.Assert.*;
-
-import org.junit.Test;
-
-public class LanguageNamesTest {
-
-       @Test
-       public void test() {
-               
-               // macro language + language == language
-               String languageA = LanguageNames.normalizeName("zh-yue");
-               String languageB = LanguageNames.normalizeName("yue");
-               assertTrue(LanguageNames.equals(languageA, languageB));
-               
-               // TODO verify that "en-Latn" == "en"
-               
-               // TODO verify that "en-GB" == "en"???
-       }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java
----------------------------------------------------------------------
diff --git 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java
 
b/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java
index 3997cdf..097cfe1 100644
--- 
a/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java
+++ 
b/tika-langdetect/src/test/java/org/apache/tika/langdetect/OptimaizeLangDetectorTest.java
@@ -13,6 +13,10 @@ import java.util.Locale;
 import java.util.Map;
 
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.language.detect.LanguageConfidence;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.language.detect.LanguageWriter;
 import org.junit.Test;
 
 public class OptimaizeLangDetectorTest extends LanguageDetectorTest {

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
----------------------------------------------------------------------
diff --git 
a/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
 
b/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
index 847c101..4eaab91 100644
--- 
a/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
+++ 
b/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
@@ -31,8 +31,8 @@ import javax.ws.rs.Produces;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.tika.langdetect.LanguageResult;
 import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageResult;
 
 @Path("/language")
 public class LanguageResource {

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
----------------------------------------------------------------------
diff --git 
a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
 
b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
index 1fb8385..89d35e8 100644
--- 
a/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
+++ 
b/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
@@ -35,7 +35,7 @@ import javax.ws.rs.core.UriInfo;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.tika.langdetect.LanguageHandler;
+import org.apache.tika.language.detect.LanguageHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
----------------------------------------------------------------------
diff --git 
a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
 
b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 57443b5..aa4e0ab 100644
--- 
a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ 
b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -34,7 +34,7 @@ import javax.ws.rs.core.UriInfo;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.tika.langdetect.LanguageHandler;
+import org.apache.tika.language.detect.LanguageHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
----------------------------------------------------------------------
diff --git 
a/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
 
b/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
index 284bb5b..fb8df65 100644
--- 
a/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
+++ 
b/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
@@ -36,9 +36,8 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.langdetect.LanguageConfidence;
-import org.apache.tika.langdetect.LanguageResult;
 import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageResult;
 import org.apache.tika.language.translate.Translator;
 
 @Path("/translate")

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
----------------------------------------------------------------------
diff --git 
a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
 
b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
index 2ff140e..d892ab9 100644
--- 
a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
+++ 
b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
@@ -2,9 +2,9 @@ package org.apache.tika.language.translate;
 
 import java.io.IOException;
 
-import org.apache.tika.langdetect.LanguageDetector;
-import org.apache.tika.langdetect.LanguageResult;
 import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
 
 
 public abstract class AbstractTranslator implements Translator {

http://git-wip-us.apache.org/repos/asf/tika/blob/f9113be5/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
----------------------------------------------------------------------
diff --git 
a/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
 
b/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
index f175681..f2011be 100644
--- 
a/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
+++ 
b/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
@@ -21,7 +21,7 @@ import java.io.IOException;
 import java.util.HashMap;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.langdetect.LanguageResult;
+import org.apache.tika.language.detect.LanguageResult;
 
 import com.fasterxml.jackson.databind.util.LRUMap;

[18/34] tika git commit: Move base lang detect classes to core

Reply via email to