Remove built-in lang detector and update all code to use the new language detector API.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a7a94ca Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a7a94ca Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a7a94ca Branch: refs/heads/master Commit: 3a7a94ca5040eabd90f6060effc517126def3fc1 Parents: e38512e Author: Ken Krugler <[email protected]> Authored: Thu Feb 11 11:54:39 2016 -0800 Committer: trevorlewis <[email protected]> Committed: Mon Mar 7 11:43:33 2016 -0800 ---------------------------------------------------------------------- .../main/java/org/apache/tika/cli/TikaCLI.java | 41 +- .../java/org/apache/tika/cli/TikaCLITest.java | 12 - .../tika/language/LanguageIdentifier.java | 240 -- .../apache/tika/language/LanguageProfile.java | 314 --- .../tika/language/LanguageProfilerBuilder.java | 770 ------ .../apache/tika/language/ProfilingHandler.java | 67 - .../apache/tika/language/ProfilingWriter.java | 103 - .../org/apache/tika/language/package-info.java | 22 - .../tika/language/LanguageIdentifierTest.java | 183 -- .../tika/language/LanguageProfileTest.java | 58 - .../language/LanguageProfilerBuilderTest.java | 100 - .../tika/language/ProfilingWriterTest.java | 44 - .../resources/org/apache/tika/language/da.test | 108 - .../resources/org/apache/tika/language/de.test | 104 - .../resources/org/apache/tika/language/el.test | 109 - .../resources/org/apache/tika/language/en.test | 105 - .../resources/org/apache/tika/language/es.test | 107 - .../resources/org/apache/tika/language/et.test | 17 - .../resources/org/apache/tika/language/fi.test | 106 - .../resources/org/apache/tika/language/fr.test | 105 - .../resources/org/apache/tika/language/it.test | 109 - .../tika/language/langbuilder/welsh_corpus.txt | 2602 ------------------ .../resources/org/apache/tika/language/lt.test | 32 - .../resources/org/apache/tika/language/nl.test | 105 - .../resources/org/apache/tika/language/pt.test | 105 - 
.../resources/org/apache/tika/language/sv.test | 108 - .../java/org/apache/tika/example/Language.java | 32 +- .../tika/example/LanguageDetectingParser.java | 15 +- .../tika/example/LanguageDetectorExample.java | 33 + .../tika/example/LanguageIdentifierExample.java | 27 - .../org/apache/tika/example/MyFirstTika.java | 14 +- .../example/LanguageDetectorExampleTest.java | 39 + .../example/LanguageIdentifierExampleTest.java | 37 - .../tika/langdetect/LanguageConfidence.java | 3 +- .../tika/langdetect/LanguageDetector.java | 7 +- .../apache/tika/langdetect/LanguageHandler.java | 14 + .../apache/tika/langdetect/LanguageResult.java | 7 + .../apache/tika/langdetect/LanguageWriter.java | 9 + .../tika/langdetect/OptimaizeLangDetector.java | 10 +- .../langdetect/OptimaizeLangDetectorTest.java | 4 +- .../tika/server/resource/LanguageResource.java | 27 +- .../tika/server/resource/MetadataResource.java | 9 +- .../resource/RecursiveMetadataResource.java | 7 +- .../tika/server/resource/TranslateResource.java | 23 +- .../language/translate/AbstractTranslator.java | 16 + .../language/translate/CachedTranslator.java | 20 +- .../language/translate/ExternalTranslator.java | 13 +- .../language/translate/GoogleTranslator.java | 20 +- .../language/translate/Lingo24Translator.java | 20 +- .../language/translate/MosesTranslator.java | 7 +- 50 files changed, 243 insertions(+), 5946 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 4458526..3efe0f7 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -18,11 +18,6 @@ package org.apache.tika.cli; import static 
java.nio.charset.StandardCharsets.UTF_8; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.TransformerConfigurationException; -import javax.xml.transform.sax.SAXTransformerFactory; -import javax.xml.transform.sax.TransformerHandler; -import javax.xml.transform.stream.StreamResult; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -57,6 +52,12 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; + import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; @@ -82,8 +83,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.fork.ForkParser; import org.apache.tika.gui.TikaGUI; import org.apache.tika.io.TikaInputStream; -import org.apache.tika.language.LanguageProfilerBuilder; -import org.apache.tika.language.ProfilingHandler; +import org.apache.tika.langdetect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadata; import org.apache.tika.metadata.serialization.JsonMetadataList; @@ -283,7 +283,7 @@ public class TikaCLI { OutputStream output, Metadata metadata) throws Exception { final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); - return new ProfilingHandler() { + return new LanguageHandler() { public void endDocument() { writer.println(getLanguage().getLanguage()); writer.flush(); @@ -305,22 +305,6 @@ public class TikaCLI { }; - /* Creates ngram profile */ - private final OutputType CREATE_PROFILE = new OutputType() { - @Override - public void process( - InputStream stream, OutputStream output, Metadata metadata) - throws Exception { 
- ngp = LanguageProfilerBuilder.create(profileName, stream, encoding); - FileOutputStream fos = new FileOutputStream(new File(profileName + ".ngp")); - ngp.save(fos);//saves ngram profile - fos.close(); - PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); - writer.println("ngram profile location:=" + new File(ngp.getName()).getCanonicalPath()); - writer.flush(); - } - }; - private ParseContext context; private Detector detector; @@ -335,8 +319,6 @@ public class TikaCLI { private boolean recursiveJSON = false; - private LanguageProfilerBuilder ngp = null; - /** * Output character encoding, or <code>null</code> for platform default */ @@ -355,8 +337,6 @@ public class TikaCLI { private boolean fork = false; - private String profileName = null; - private boolean prettyPrint; public TikaCLI() throws Exception { @@ -474,9 +454,6 @@ public class TikaCLI { } else if (arg.startsWith("--client=")) { URI uri = new URI(arg.substring("--client=".length())); parser = new NetworkParser(uri); - } else if(arg.startsWith("--create-profile=")){ - profileName = arg.substring("--create-profile=".length()); - type = CREATE_PROFILE; } else { pipeMode = false; if (serverMode) { @@ -586,8 +563,6 @@ public class TikaCLI { out.println(" -r or --pretty-print For JSON, XML and XHTML outputs, adds newlines and"); out.println(" whitespace, for better readability"); out.println(); - out.println(" --create-profile=X"); - out.println(" Create NGram profile, where X is a profile name"); out.println(" --list-parsers"); out.println(" List the available document parsers"); out.println(" --list-parser-details"); http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 9fc8ee8..4a68475 100644 --- 
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -54,18 +54,6 @@ public class TikaCLITest { } /** - * Creates a welsh language profile - * - * @throws Exception - */ - @Test - public void testCreateProfile() throws Exception { - String[] params = {"--create-profile=welsh", "-eUTF-8", resourcePrefix + "welsh_corpus.txt"}; - TikaCLI.main(params); - assertTrue(profile.exists()); - } - - /** * Tests --list-parser-detail option of the cli * * @throws Exception http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java b/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java deleted file mode 100644 index 00f6d06..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.HashMap; -import java.util.Map; -import java.util.Properties; -import java.util.Set; - -import static java.nio.charset.StandardCharsets.UTF_8; - -/** - * Identifier of the language that best matches a given content profile. - * The content profile is compared to generic language profiles based on - * material from various sources. - * - * @since Apache Tika 0.5 - * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/"> - * Europarl: A Parallel Corpus for Statistical Machine Translation</a> - * @see <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php"> - * ISO 639 Language Codes</a> - */ -public class LanguageIdentifier { - - /** - * The available language profiles. - */ - private static final Map<String, LanguageProfile> PROFILES = - new HashMap<String, LanguageProfile>(); - private static final String PROFILE_SUFFIX = ".ngp"; - - private static Properties props = new Properties(); - private static String errors = ""; - - private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties"; - private static final String PROPERTIES_FILE = "tika.language.properties"; - private static final String LANGUAGES_KEY = "languages"; - private static final double CERTAINTY_LIMIT = 0.022; - - private final String language; - - private final double distance; - - /* - * Always attempt initializing language profiles when class is loaded first time - */ - static { - initProfiles(); - } - - /* - * Add one language profile based on config in property file - */ - private static void addProfile(String language) throws Exception { - try { - LanguageProfile profile = new LanguageProfile(); - - try (InputStream stream = - LanguageIdentifier.class.getResourceAsStream( - language + PROFILE_SUFFIX)) { - BufferedReader reader = - new BufferedReader(new 
InputStreamReader(stream, UTF_8)); - String line = reader.readLine(); - while (line != null) { - if (line.length() > 0 && !line.startsWith("#")) { - int space = line.indexOf(' '); - profile.add( - line.substring(0, space), - Long.parseLong(line.substring(space + 1))); - } - line = reader.readLine(); - } - } - - addProfile(language, profile); - } catch (Throwable t) { - throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage()); - } - } - - /** - * Adds a single language profile - * @param language an ISO 639 code representing language - * @param profile the language profile - */ - public static void addProfile(String language, LanguageProfile profile) { - PROFILES.put(language, profile); - } - - /** - * Constructs a language identifier based on a LanguageProfile - * @param profile the language profile - */ - public LanguageIdentifier(LanguageProfile profile) { - String minLanguage = "unknown"; - double minDistance = 1.0; - for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) { - double distance = profile.distance(entry.getValue()); - if (distance < minDistance) { - minDistance = distance; - minLanguage = entry.getKey(); - } - } - - this.language = minLanguage; - this.distance = minDistance; - } - - /** - * Constructs a language identifier based on a String of text content - * @param content the text - */ - public LanguageIdentifier(String content) { - this(new LanguageProfile(content)); - } - - /** - * Gets the identified language - * @return an ISO 639 code representing the detected language - */ - public String getLanguage() { - return language; - } - - /** - * Tries to judge whether the identification is certain enough - * to be trusted. - * WARNING: Will never return true for small amount of input texts. 
- * @return <code>true</code> if the distance is smaller then {@value #CERTAINTY_LIMIT}, <code>false</code> otherwise - */ - public boolean isReasonablyCertain() { - return distance < CERTAINTY_LIMIT; - } - - /** - * Builds the language profiles. - * The list of languages are fetched from a property file named "tika.language.properties" - * If a file called "tika.language.override.properties" is found on classpath, this is used instead - * The property file contains a key "languages" with values being comma-separated language codes - */ - public static void initProfiles() { - clearProfiles(); - - errors = ""; - InputStream stream; - stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE); - if(stream == null) { - stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE); - } - - if(stream != null){ - try { - props = new Properties(); - props.load(stream); - } catch (IOException e) { - errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n"; - } - } - - String[] languages = props.getProperty(LANGUAGES_KEY).split(","); - for(String language : languages) { - language = language.trim(); - String name = props.getProperty("name."+language, "Unknown"); - try { - addProfile(language); - } catch (Exception e) { - errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n"; - } - } - } - - /** - * Initializes the language profiles from a user supplied initialized Map. 
- * This overrides the default set of profiles initialized at startup, - * and provides an alternative to configuring profiles through property file - * - * @param profilesMap map of language profiles - */ - public static void initProfiles(Map<String, LanguageProfile> profilesMap) { - clearProfiles(); - for(Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) { - addProfile(entry.getKey(), entry.getValue()); - } - } - - /** - * Clears the current map of language profiles - */ - public static void clearProfiles() { - PROFILES.clear(); - } - - /** - * Tests whether there were errors initializing language config - * @return true if there are errors. Use getErrors() to retrieve. - */ - public static boolean hasErrors() { - return errors != ""; - } - - /** - * Returns a string of error messages related to initializing langauge profiles - * @return the String containing the error messages - */ - public static String getErrors() { - return errors; - } - - /** - * Returns what languages are supported for language identification - * @return A set of Strings being the ISO 639 language codes - */ - public static Set<String> getSupportedLanguages() { - return PROFILES.keySet(); - } - - @Override - public String toString() { - return language + " (" + distance + ")"; - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java b/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java deleted file mode 100644 index 9442920..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; - -/** - * Language profile based on ngram counts. - * - * @since Apache Tika 0.5 - */ -public class LanguageProfile { - - public static final int DEFAULT_NGRAM_LENGTH = 3; - - private final int length; - - /** - * The ngrams that make up this profile. - */ - private final Map<String, Counter> ngrams = - new HashMap<String, Counter>(); - - /** - * Sorted ngram cache for faster distance calculation. - */ - private Interleaved interleaved = new Interleaved(); - public static boolean useInterleaved = true; // For testing purposes - - /** - * The sum of all ngram counts in this profile. - * Used to calculate relative ngram frequency. 
- */ - private long count = 0; - - private static class Counter { - private long count = 0; - public String toString() { - return Long.toString(count); - } - } - - public LanguageProfile(int length) { - this.length = length; - } - - public LanguageProfile() { - this(DEFAULT_NGRAM_LENGTH); - } - - public LanguageProfile(String content, int length) { - this(length); - - ProfilingWriter writer = new ProfilingWriter(this); - char[] ch = content.toCharArray(); - writer.write(ch, 0, ch.length); - } - - public LanguageProfile(String content) { - this(content, DEFAULT_NGRAM_LENGTH); - } - - public long getCount() { - return count; - } - - public long getCount(String ngram) { - Counter counter = ngrams.get(ngram); - if (counter != null) { - return counter.count; - } else { - return 0; - } - } - - /** - * Adds a single occurrence of the given ngram to this profile. - * - * @param ngram the ngram - */ - public void add(String ngram) { - add(ngram, 1); - } - - /** - * Adds multiple occurrences of the given ngram to this profile. - * - * @param ngram the ngram - * @param count number of occurrences to add - */ - public void add(String ngram, long count) { - if (length != ngram.length()) { - throw new IllegalArgumentException( - "Unable to add an ngram of incorrect length: " - + ngram.length() + " != " + length); - } - - Counter counter = ngrams.get(ngram); - if (counter == null) { - counter = new Counter(); - ngrams.put(ngram, counter); - } - counter.count += count; - this.count += count; - } - - /** - * Calculates the geometric distance between this and the given - * other language profile. - * - * @param that the other language profile - * @return distance between the profiles - */ - public double distance(LanguageProfile that) { - return useInterleaved ? 
distanceInterleaved(that) : distanceStandard(that); - } - - private double distanceStandard(LanguageProfile that) { - if (length != that.length) { - throw new IllegalArgumentException( - "Unable to calculage distance of language profiles" - + " with different ngram lengths: " - + that.length + " != " + length); - } - - double sumOfSquares = 0.0; - double thisCount = Math.max(this.count, 1.0); - double thatCount = Math.max(that.count, 1.0); - - Set<String> ngrams = new HashSet<String>(); - ngrams.addAll(this.ngrams.keySet()); - ngrams.addAll(that.ngrams.keySet()); - for (String ngram : ngrams) { - double thisFrequency = this.getCount(ngram) / thisCount; - double thatFrequency = that.getCount(ngram) / thatCount; - double difference = thisFrequency - thatFrequency; - sumOfSquares += difference * difference; - } - - return Math.sqrt(sumOfSquares); - } - - @Override - public String toString() { - return ngrams.toString(); - } - - /* Code for interleaved distance calculation below */ - - private double distanceInterleaved(LanguageProfile that) { - if (length != that.length) { - throw new IllegalArgumentException( - "Unable to calculage distance of language profiles" - + " with different ngram lengths: " - + that.length + " != " + length); - } - - double sumOfSquares = 0.0; - double thisCount = Math.max(this.count, 1.0); - double thatCount = Math.max(that.count, 1.0); - - Interleaved.Entry thisEntry = updateInterleaved().firstEntry(); - Interleaved.Entry thatEntry = that.updateInterleaved().firstEntry(); - - // Iterate the lists in parallel, until both lists has been depleted - while (thisEntry.hasNgram() || thatEntry.hasNgram()) { - if (!thisEntry.hasNgram()) { // Depleted this - sumOfSquares += square(thatEntry.count / thatCount); - thatEntry.next(); - continue; - } - - if (!thatEntry.hasNgram()) { // Depleted that - sumOfSquares += square(thisEntry.count / thisCount); - thisEntry.next(); - continue; - } - - final int compare = thisEntry.compareTo(thatEntry); - - if 
(compare == 0) { // Term exists both in this and that - double difference = thisEntry.count/thisCount - thatEntry.count/thatCount; - sumOfSquares += square(difference); - thisEntry.next(); - thatEntry.next(); - } else if (compare < 0) { // Term exists only in this - sumOfSquares += square(thisEntry.count/thisCount); - thisEntry.next(); - } else { // Term exists only in that - sumOfSquares += square(thatEntry.count/thatCount); - thatEntry.next(); - } - } - return Math.sqrt(sumOfSquares); - } - private double square(double count) { - return count * count; - } - - private class Interleaved { - - private char[] entries = null; // <ngram(length chars)><count(2 chars)>* - private int size = 0; // Number of entries (one entry = length+2 chars) - private long entriesGeneratedAtCount = -1; // Keeps track of when the sequential structure was current - - /** - * Ensure that the entries array is in sync with the ngrams. - */ - public void update() { - if (count == entriesGeneratedAtCount) { // Already up to date - return; - } - size = ngrams.size(); - final int numChars = (length+2)*size; - if (entries == null || entries.length < numChars) { - entries = new char[numChars]; - } - int pos = 0; - for (Map.Entry<String, Counter> entry: getSortedNgrams()) { - for (int l = 0 ; l < length ; l++) { - entries[pos + l] = entry.getKey().charAt(l); - } - entries[pos + length] = (char)(entry.getValue().count / 65536); // Upper 16 bit - entries[pos + length + 1] = (char)(entry.getValue().count % 65536); // lower 16 bit - pos += length + 2; - } - entriesGeneratedAtCount = count; - } - - public Entry firstEntry() { - Entry entry = new Entry(); - if (size > 0) { - entry.update(0); - } - return entry; - } - - private List<Map.Entry<String, Counter>> getSortedNgrams() { - List<Map.Entry<String, Counter>> entries = new ArrayList<Map.Entry<String, Counter>>(ngrams.size()); - entries.addAll(ngrams.entrySet()); - Collections.sort(entries, new Comparator<Map.Entry<String, Counter>>() { - @Override - 
public int compare(Map.Entry<String, Counter> o1, Map.Entry<String, Counter> o2) { - return o1.getKey().compareTo(o2.getKey()); - } - }); - return entries; - } - - private class Entry implements Comparable<Entry> { - char[] ngram = new char[length]; - int count = 0; - int pos = 0; - - private void update(int pos) { - this.pos = pos; - if (pos >= size) { // Reached the end - return; - } - final int origo = pos*(length+2); - System.arraycopy(entries, origo, ngram, 0, length); - count = entries[origo+length] * 65536 + entries[origo+length+1]; - } - - @Override - public int compareTo(Entry other) { - for (int i = 0 ; i < ngram.length ; i++) { - if (ngram[i] != other.ngram[i]) { - return ngram[i] - other.ngram[i]; - } - } - return 0; - } - public boolean hasNext() { - return pos < size-1; - } - public boolean hasNgram() { - return pos < size; - } - public void next() { - update(pos+1); - } - public String toString() { - return new String(ngram) + "(" + count + ")"; - } - } - } - private Interleaved updateInterleaved() { - interleaved.update(); - return interleaved; - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java b/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java deleted file mode 100644 index bac1f97..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java +++ /dev/null @@ -1,770 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - -// JDK imports -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import org.apache.tika.exception.TikaException; - -import static java.nio.charset.StandardCharsets.UTF_8; - -/** - * This class runs a ngram analysis over submitted text, results might be used - * for automatic language identification. - * - * The similarity calculation is at experimental level. You have been warned. - * - * Methods are provided to build new NGramProfiles profiles. - * - * @author Sami Siren - * @author Jerome Charron - http://frutch.free.fr/ - */ -public class LanguageProfilerBuilder { - - // public static final Log LOG = - // LogFactory.getLog(LanguageProfilerBuilder.class); - - /** The minimum length allowed for a ngram. */ - final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */ - - /** The maximum length allowed for a ngram. 
*/ - final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */ - - /** The default min length of ngram */ - final static int DEFAULT_MIN_NGRAM_LENGTH = 3; - - /** The default max length of ngram */ - final static int DEFAULT_MAX_NGRAM_LENGTH = 3; - - /** The ngram profile file extension */ - final static String FILE_EXTENSION = "ngp"; - - /** The profile max size (number of ngrams of the same size) */ - final static int MAX_SIZE = 1000; - - /** separator char */ - final static char SEPARATOR = '_'; - /** The String form of the separator char */ - private final static String SEP_CHARSEQ = new String( - new char[] { SEPARATOR }); - - /** The profile's name */ - private String name = null; - - /** The NGrams of this profile sorted on the number of occurrences */ - private List<NGramEntry> sorted = null; - - /** The min length of ngram */ - private int minLength = DEFAULT_MIN_NGRAM_LENGTH; - - /** The max length of ngram */ - private int maxLength = DEFAULT_MAX_NGRAM_LENGTH; - - /** The total number of ngrams occurences */ - private int[] ngramcounts = null; - - /** An index of the ngrams of the profile */ - private Map<CharSequence, NGramEntry> ngrams = null; - - /** A StringBuffer used during analysis */ - private QuickStringBuffer word = new QuickStringBuffer(); - - /** - * Constructs a new ngram profile - * - * @param name is the name of the profile - * @param minlen is the min length of ngram sequences - * @param maxlen is the max length of ngram sequences - */ - public LanguageProfilerBuilder(String name, int minlen, int maxlen) { - // TODO: Compute the initial capacity using minlen and maxlen. 
- this.ngrams = new HashMap<CharSequence, NGramEntry>(4000); - this.minLength = minlen; - this.maxLength = maxlen; - this.name = name; - } - - /** - * Constructs a new ngram profile where minlen=3, maxlen=3 - * - * @param name is a name of profile, usually two length string - * @since Tika 1.0 - */ - public LanguageProfilerBuilder(String name) { - this.ngrams = new HashMap<CharSequence, NGramEntry>(4000); - this.minLength = ABSOLUTE_MIN_NGRAM_LENGTH; - this.maxLength = ABSOLUTE_MAX_NGRAM_LENGTH; - this.name = name; - } - - /** - * @return Returns the name. - */ - public String getName() { - return name; - } - - // This method was commented because it depends on org.apache.lucene.analysis.Token - // that is not a part of the Tika - // /** - // * Adds ngrams from a token to this profile - // * - // * @param t is the Token to be added - // */ - // public void add(Token t) { - // add(new StringBuffer().append(SEPARATOR) - // .append(t.term()) - // .append(SEPARATOR)); - // } - - /** - * Adds ngrams from a single word to this profile - * - * @param word is the word to add - */ - public void add(StringBuffer word) { - for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) { - add(word, i); - } - } - - /** - * Adds the last NGrams from the specified word. 
- */ - private void add(QuickStringBuffer word) { - int wlen = word.length(); - if (wlen >= minLength) { - int max = Math.min(maxLength, wlen); - for (int i = minLength; i <= max; i++) { - add(word.subSequence(wlen - i, wlen)); - } - } - } - - /** - * Adds ngrams from a single word in this profile - * - * @param word is the word to add - * @param n is the ngram size - */ - private void add(CharSequence cs) { - - if (cs.equals(SEP_CHARSEQ)) { - return; - } - NGramEntry nge = ngrams.get(cs); - if (nge == null) { - nge = new NGramEntry(cs); - ngrams.put(cs, nge); - } - nge.inc(); - } - - /** - * Analyzes a piece of text - * - * @param text - * the text to be analyzed - */ - public void analyze(StringBuilder text) { - - if (ngrams != null) { - ngrams.clear(); - sorted = null; - ngramcounts = null; - } - - word.clear().append(SEPARATOR); - for (int i = 0; i < text.length(); i++) { - char c = Character.toLowerCase(text.charAt(i)); - - if (Character.isLetter(c)) { - add(word.append(c)); - } else { - // found word boundary - if (word.length() > 1) { - // we have a word! - add(word.append(SEPARATOR)); - word.clear().append(SEPARATOR); - } - } - } - - if (word.length() > 1) { - // we have a word! 
- add(word.append(SEPARATOR)); - } - normalize(); - } - - /** - * @param word - * @param n sequence length - */ - private void add(StringBuffer word, int n) { - for (int i = 0; i <= word.length() - n; i++) { - add(word.subSequence(i, i + n)); - } - } - - /** - * Normalizes the profile (calculates the ngrams frequencies) - */ - protected void normalize() { - NGramEntry e = null; - Iterator<NGramEntry> i = ngrams.values().iterator(); - - // Calculates ngram count if not already done - if (ngramcounts == null) { - ngramcounts = new int[maxLength + 1]; - while (i.hasNext()) { - e = i.next(); - ngramcounts[e.size()] += e.count; - } - } - - i = ngrams.values().iterator(); - while (i.hasNext()) { - e = i.next(); - e.frequency = (float) e.count / (float) ngramcounts[e.size()]; - } - } - - /** - * Returns a sorted list of ngrams (sort done by 1. frequency 2. sequence) - * - * @return sorted vector of ngrams - */ - public List<NGramEntry> getSorted() { - // make sure sorting is done only once - if (sorted == null) { - sorted = new ArrayList<NGramEntry>(ngrams.values()); - Collections.sort(sorted); - - // trim at NGRAM_LENGTH entries - if (sorted.size() > MAX_SIZE) { - sorted = sorted.subList(0, MAX_SIZE); - } - } - return sorted; - } - - // Inherited JavaDoc - public String toString() { - - StringBuffer s = new StringBuffer().append("NGramProfile: ") - .append(name).append("\n"); - - Iterator<NGramEntry> i = getSorted().iterator(); - - while (i.hasNext()) { - NGramEntry entry = i.next(); - s.append("[").append(entry.seq).append("/").append(entry.count) - .append("/").append(entry.frequency).append("]\n"); - } - return s.toString(); - } - - /** - * Calculates a score how well NGramProfiles match each other - * - * @param another - * ngram profile to compare against - * @return similarity 0=exact match - * @throws TikaException - * if could not calculate a score - */ - public float getSimilarity(LanguageProfilerBuilder another) - throws TikaException { - - float sum = 0; - - 
try { - Iterator<NGramEntry> i = another.getSorted().iterator(); - while (i.hasNext()) { - NGramEntry other = i.next(); - if (ngrams.containsKey(other.seq)) { - sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2; - } else { - sum += other.frequency; - } - } - i = getSorted().iterator(); - while (i.hasNext()) { - NGramEntry other = i.next(); - if (another.ngrams.containsKey(other.seq)) { - sum += Math.abs((other.frequency - another.ngrams - .get(other.seq).frequency)) / 2; - } else { - sum += other.frequency; - } - } - } catch (Exception e) { - throw new TikaException("Could not calculate a score how well NGramProfiles match each other"); - } - return sum; - } - - /** - * Loads a ngram profile from an InputStream (assumes UTF-8 encoded content) - * - * @param is the InputStream to read - */ - public void load(InputStream is) throws IOException { - - ngrams.clear(); - ngramcounts = new int[maxLength + 1]; - BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8)); - String line = null; - - while ((line = reader.readLine()) != null) { - - // # starts a comment line - if (line.charAt(0) != '#') { - int spacepos = line.indexOf(' '); - String ngramsequence = line.substring(0, spacepos).trim(); - int len = ngramsequence.length(); - if ((len >= minLength) && (len <= maxLength)) { - int ngramcount = Integer.parseInt(line.substring(spacepos + 1)); - NGramEntry en = new NGramEntry(ngramsequence, ngramcount); - ngrams.put(en.getSeq(), en); - ngramcounts[len] += ngramcount; - } - } - } - normalize(); - } - - /** - * Creates a new Language profile from (preferably quite large - 5-10k of - * lines) text file - * - * @param name to be given for the profile - * @param is a stream to be read - * @param encoding is the encoding of stream - * - * @throws TikaException if could not create a language profile - * - */ - public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException { - - 
LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name, - ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH); - BufferedInputStream bis = new BufferedInputStream(is); - - byte buffer[] = new byte[4096]; - StringBuilder text = new StringBuilder(); - int len; - - try { - while ((len = bis.read(buffer)) != -1) { - text.append(new String(buffer, 0, len, encoding)); - } - } catch (IOException e) { - throw new TikaException("Could not create profile, " + e.getMessage()); - } - - newProfile.analyze(text); - return newProfile; - } - - /** - * Writes NGramProfile content into OutputStream, content is outputted with - * UTF-8 encoding - * - * @param os the Stream to output to - * - * @throws IOException - */ - public void save(OutputStream os) throws IOException { - os.write(("# NgramProfile generated at " + new Date() + - " for Apache Tika Language Identification\n").getBytes(UTF_8)); - - // And then each ngram - - // First dispatch ngrams in many lists depending on their size - // (one list for each size, in order to store MAX_SIZE ngrams for each - // size of ngram) - List<NGramEntry> list = new ArrayList<NGramEntry>(); - List<NGramEntry> sublist = new ArrayList<NGramEntry>(); - NGramEntry[] entries = ngrams.values().toArray( - new NGramEntry[ngrams.size()]); - for (int i = minLength; i <= maxLength; i++) { - for (int j = 0; j < entries.length; j++) { - if (entries[j].getSeq().length() == i) { - sublist.add(entries[j]); - } - } - Collections.sort(sublist); - if (sublist.size() > MAX_SIZE) { - sublist = sublist.subList(0, MAX_SIZE); - } - list.addAll(sublist); - sublist.clear(); - } - for (int i = 0; i < list.size(); i++) { - NGramEntry e = list.get(i); - String line = e.toString() + " " + e.getCount() + "\n"; - os.write(line.getBytes(UTF_8)); - } - os.flush(); - } - - /** - * main method used for testing only - * - * @param args - */ - public static void main(String args[]) { - - // -create he sample_he.txt utf-8 - - String usage = "Usage: NGramProfile " 
- + "[-create profilename filename encoding] " - + "[-similarity file1 file2] " - + "[-score profile-name filename encoding]"; - int command = 0; - - final int CREATE = 1; - final int SIMILARITY = 2; - final int SCORE = 3; - - String profilename = ""; - String filename = ""; - String filename2 = ""; - String encoding = ""; - - if (args.length == 0) { - System.err.println(usage); - System.exit(-1); - } - - for (int i = 0; i < args.length; i++) { // parse command line - if (args[i].equals("-create")) { // found -create option - command = CREATE; - profilename = args[++i]; - filename = args[++i]; - encoding = args[++i]; - } - - if (args[i].equals("-similarity")) { // found -similarity option - command = SIMILARITY; - filename = args[++i]; - filename2 = args[++i]; - encoding = args[++i]; - } - - if (args[i].equals("-score")) { // found -Score option - command = SCORE; - profilename = args[++i]; - filename = args[++i]; - encoding = args[++i]; - } - } - - try { - - switch (command) { - - case CREATE: - - File f = new File(filename); - FileInputStream fis = new FileInputStream(f); - LanguageProfilerBuilder newProfile = LanguageProfilerBuilder - .create(profilename, fis, encoding); - fis.close(); - f = new File(profilename + "." + FILE_EXTENSION); - FileOutputStream fos = new FileOutputStream(f); - newProfile.save(fos); - System.out.println("new profile " + profilename + "." 
- + FILE_EXTENSION + " was created."); - break; - - case SIMILARITY: - - f = new File(filename); - fis = new FileInputStream(f); - newProfile = LanguageProfilerBuilder.create(filename, fis, - encoding); - newProfile.normalize(); - - f = new File(filename2); - fis = new FileInputStream(f); - LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder - .create(filename2, fis, encoding); - newProfile2.normalize(); - System.out.println("Similarity is " - + newProfile.getSimilarity(newProfile2)); - break; - - case SCORE: - f = new File(filename); - fis = new FileInputStream(f); - newProfile = LanguageProfilerBuilder.create(filename, fis, - encoding); - - f = new File(profilename + "." + FILE_EXTENSION); - fis = new FileInputStream(f); - LanguageProfilerBuilder compare = new LanguageProfilerBuilder( - profilename, DEFAULT_MIN_NGRAM_LENGTH, - DEFAULT_MAX_NGRAM_LENGTH); - compare.load(fis); - System.out.println("Score is " - + compare.getSimilarity(newProfile)); - break; - - } - - } catch (Exception e) { - e.printStackTrace(); - // throw new TikaException(""); - } - } - - - /** - * Inner class that describes a NGram - */ - static class NGramEntry implements Comparable<NGramEntry> { - - /** The NGRamProfile this NGram is related to */ - private LanguageProfilerBuilder profile = null; - - /** The sequence of characters of the ngram */ - CharSequence seq = null; - - /** The number of occurences of this ngram in its profile */ - private int count = 0; - - /** The frequency of this ngram in its profile */ - private float frequency = 0.0F; - - /** - * Constructs a new NGramEntry - * - * @param seq is the sequence of characters of the ngram - */ - public NGramEntry(CharSequence seq) { - this.seq = seq; - } - - /** - * Constructs a new NGramEntry - * - * @param seq is the sequence of characters of the ngram - * @param count is the number of occurrences of this ngram - */ - public NGramEntry(String seq, int count) { - this.seq = new StringBuffer(seq).subSequence(0, 
seq.length()); - this.count = count; - } - - /** - * Returns the number of occurrences of this ngram in its profile - * - * @return the number of occurrences of this ngram in its profile - */ - public int getCount() { - return count; - } - - /** - * Returns the frequency of this ngram in its profile - * - * @return the frequency of this ngram in its profile - */ - public float getFrequency() { - return frequency; - } - - /** - * Returns the sequence of characters of this ngram - * - * @return the sequence of characters of this ngram - */ - public CharSequence getSeq() { - return seq; - } - - /** - * Returns the size of this ngram - * - * @return the size of this ngram - */ - public int size() { - return seq.length(); - } - - // Inherited JavaDoc - public int compareTo(NGramEntry ngram) { - int diff = Float.compare(ngram.getFrequency(), frequency); - if (diff != 0) { - return diff; - } else { - return (toString().compareTo(ngram.toString())); - } - } - - /** - * Increments the number of occurrences of this ngram. 
- */ - public void inc() { - count++; - } - - /** - * Associated a profile to this ngram - * - * @param profile - * is the profile associated to this ngram - */ - public void setProfile(LanguageProfilerBuilder profile) { - this.profile = profile; - } - - /** - * Returns the profile associated to this ngram - * - * @return the profile associated to this ngram - */ - public LanguageProfilerBuilder getProfile() { - return profile; - } - - // Inherited JavaDoc - public String toString() { - return seq.toString(); - } - - // Inherited JavaDoc - public int hashCode() { - return seq.hashCode(); - } - - // Inherited JavaDoc - public boolean equals(Object obj) { - - NGramEntry ngram = null; - try { - ngram = (NGramEntry) obj; - return ngram.seq.equals(seq); - } catch (Exception e) { - return false; - } - } - - } - - private static class QuickStringBuffer implements CharSequence { - - private char value[]; - - private int count; - - QuickStringBuffer() { - this(16); - } - - QuickStringBuffer(char[] value) { - this.value = value; - count = value.length; - } - - QuickStringBuffer(int length) { - value = new char[length]; - } - - QuickStringBuffer(String str) { - this(str.length() + 16); - append(str); - } - - public int length() { - return count; - } - - private void expandCapacity(int minimumCapacity) { - int newCapacity = (value.length + 1) * 2; - if (newCapacity < 0) { - newCapacity = Integer.MAX_VALUE; - } else if (minimumCapacity > newCapacity) { - newCapacity = minimumCapacity; - } - - char newValue[] = new char[newCapacity]; - System.arraycopy(value, 0, newValue, 0, count); - value = newValue; - } - - QuickStringBuffer clear() { - count = 0; - return this; - } - - public char charAt(int index) { - return value[index]; - } - - QuickStringBuffer append(String str) { - if (str == null) { - str = String.valueOf(str); - } - - int len = str.length(); - int newcount = count + len; - if (newcount > value.length) { - expandCapacity(newcount); - } - str.getChars(0, len, value, 
count); - count = newcount; - return this; - } - - QuickStringBuffer append(char c) { - int newcount = count + 1; - if (newcount > value.length) { - expandCapacity(newcount); - } - value[count++] = c; - return this; - } - - public CharSequence subSequence(int start, int end) { - return new String(value, start, end - start); - } - - public String toString() { - return new String(this.value); - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java b/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java deleted file mode 100644 index 5c5b9d2..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - -import org.apache.tika.sax.WriteOutContentHandler; - -/** - * SAX content handler that builds a language profile based on all the - * received character content. 
- * - * @since Apache Tika 0.5 - */ -public class ProfilingHandler extends WriteOutContentHandler { - - private final ProfilingWriter writer; - - public ProfilingHandler(ProfilingWriter writer) { - super(writer); - this.writer = writer; - } - - public ProfilingHandler(LanguageProfile profile) { - this(new ProfilingWriter(profile)); - } - - public ProfilingHandler() { - this(new ProfilingWriter()); - } - - /** - * Returns the language profile being built by this content handler. - * Note that the returned profile gets updated whenever new SAX events - * are received by this content handler. Use the {@link #getLanguage()} - * method to get the language that best matches the current state of - * the profile. - * - * @return language profile - */ - public LanguageProfile getProfile() { - return writer.getProfile(); - } - - /** - * Returns the language that best matches the current state of the - * language profile. - * - * @return language that best matches the current profile - */ - public LanguageIdentifier getLanguage() { - return writer.getLanguage(); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java b/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java deleted file mode 100644 index 33ce707..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - -import java.io.IOException; -import java.io.Writer; - -/** - * Writer that builds a language profile based on all the written content. - * - * @since Apache Tika 0.5 - */ -public class ProfilingWriter extends Writer { - - private final LanguageProfile profile; - - private char[] buffer = new char[] { 0, 0, '_' }; - - private int n = 1; - - public ProfilingWriter(LanguageProfile profile) { - this.profile = profile; - } - - public ProfilingWriter() { - this(new LanguageProfile()); - } - - /** - * Returns the language profile being built by this writer. Note that - * the returned profile gets updated whenever new characters are written. - * Use the {@link #getLanguage()} method to get the language that best - * matches the current state of the profile. - * - * @return language profile - */ - public LanguageProfile getProfile() { - return profile; - } - - /** - * Returns the language that best matches the current state of the - * language profile. 
- * - * @return language that best matches the current profile - */ - public LanguageIdentifier getLanguage() { - return new LanguageIdentifier(profile); - } - - @Override - public void write(char[] cbuf, int off, int len) { - for (int i = 0; i < len; i++) { - char c = Character.toLowerCase(cbuf[off + i]); - if (Character.isLetter(c)) { - addLetter(c); - } else { - addSeparator(); - } - } - } - - private void addLetter(char c) { - System.arraycopy(buffer, 1, buffer, 0, buffer.length - 1); - buffer[buffer.length - 1] = c; - n++; - if (n >= buffer.length) { - profile.add(new String(buffer)); - } - } - - private void addSeparator() { - addLetter('_'); - n = 1; - } - - @Override - public void close() throws IOException { - addSeparator(); - } - - /** - * Ignored. - */ - @Override - public void flush() { - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/package-info.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/package-info.java b/tika-core/src/main/java/org/apache/tika/language/package-info.java deleted file mode 100644 index f8dc4bf..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Language detection. - */ [email protected]("1.0.0") -package org.apache.tika.language; http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java b/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java deleted file mode 100644 index 0c5834b..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Writer; -import java.util.HashMap; -import java.util.Locale; - -import org.apache.tika.io.IOUtils; -import org.junit.Before; -import org.junit.Test; - -/** - * JUnit based test of class {@link LanguageIdentifier}. - * - * @author Sami Siren - * @author Jerome Charron - http://frutch.free.fr/ - */ -public class LanguageIdentifierTest { - - private static final String[] languages = new String[] { - // TODO - currently Estonian and Greek fail these tests. - // Enable when language detection works better. - "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", - "lt", "nl", "pt", "sv" - }; - - @Before - public void setUp() { - LanguageIdentifier.initProfiles(); - } - - @Test - public void testLanguageDetection() throws IOException { - for (String language : languages) { - ProfilingWriter writer = new ProfilingWriter(); - writeTo(language, writer); - LanguageIdentifier identifier = null; - identifier = new LanguageIdentifier(writer.getProfile()); - assertEquals(language, identifier.getLanguage()); - // Lithuanian is detected but isn't reasonably certain: - if (!language.equals("lt")) { - assertTrue(identifier.toString(), identifier.isReasonablyCertain()); - } - } - } - - @Test - public void testClearAddAndInitProfiles() throws IOException { - // Prepare english and german language profiles - ProfilingWriter enWriter = new ProfilingWriter(); - writeTo("en", enWriter); - LanguageProfile enProfile = enWriter.getProfile(); - ProfilingWriter deWriter = new ProfilingWriter(); - writeTo("de", deWriter); - LanguageProfile deProfile = deWriter.getProfile(); - - // Out of the box profiles - LanguageIdentifier identifier = null; - 
identifier = new LanguageIdentifier(enProfile); - assertEquals("en", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - - // No profiles - LanguageIdentifier.clearProfiles(); - identifier = new LanguageIdentifier(enProfile); - assertFalse(identifier.isReasonablyCertain()); - - // Only English profile - LanguageIdentifier.addProfile("en", enProfile); - identifier = new LanguageIdentifier(enProfile); - assertEquals("en", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - - // English and German profiles loaded explicitly from initProfiles method - HashMap<String, LanguageProfile> profilesMap = new HashMap<String, LanguageProfile>(); - profilesMap.put("en", enProfile); - profilesMap.put("de", deProfile); - LanguageIdentifier.initProfiles(profilesMap); - identifier = new LanguageIdentifier(enProfile); - assertEquals("en", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - identifier = new LanguageIdentifier(deProfile); - assertEquals("de", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - } - - // Enable this to compare performance - public void testPerformance() throws IOException { - final int MRUNS = 8; - final int IRUNS = 10; - int detected = 0; // To avoid code removal by JVM or compiler - String lastResult = null; - for (int m = 0 ; m < MRUNS ; m++) { - LanguageProfile.useInterleaved = (m & 1) == 1; // Alternate between standard and interleaved - String currentResult = ""; - final long start = System.nanoTime(); - for (int i = 0 ; i < IRUNS ; i++) { - for (String language : languages) { - ProfilingWriter writer = new ProfilingWriter(); - writeTo(language, writer); - LanguageIdentifier identifier = new LanguageIdentifier(writer.getProfile()); - if (identifier.isReasonablyCertain()) { - currentResult += identifier.getLanguage(); - detected++; - } - } - } - System.out.println(String.format(Locale.ROOT, - "Performed %d detections at %2d ms/test with 
interleaved=%b", - languages.length*IRUNS, (System.nanoTime()-start)/1000000/(languages.length*IRUNS), - LanguageProfile.useInterleaved)); - if (lastResult != null) { // Might as well test that they behave the same while we're at it - assertEquals("This result should be equal to the last", lastResult, currentResult); - } - lastResult = currentResult; - } - if (detected == -1) { - System.out.println("Never encountered but keep it to guard against over-eager optimization"); - } - } - - @Test - public void testMixedLanguages() throws IOException { - for (String language : languages) { - for (String other : languages) { - if (!language.equals(other)) { - if (language.equals("lt") || other.equals("lt")) { - continue; - } - ProfilingWriter writer = new ProfilingWriter(); - writeTo(language, writer); - writeTo(other, writer); - LanguageIdentifier identifier = null; - identifier = new LanguageIdentifier(writer.getProfile()); - assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + identifier, identifier.isReasonablyCertain()); - } - } - } - } - - // TIKA-453: Fix up language identifier used for Estonian - @Test - public void testEstonia() throws Exception { - final String estonian = "et"; - ProfilingWriter writer = new ProfilingWriter(); - writeTo(estonian, writer); - LanguageIdentifier identifier = - new LanguageIdentifier(writer.getProfile()); - assertEquals(estonian, identifier.getLanguage()); - } - - private void writeTo(String language, Writer writer) throws IOException { - try (InputStream stream = - LanguageIdentifierTest.class.getResourceAsStream( - language + ".test")) { - IOUtils.copy(new InputStreamReader(stream, UTF_8), writer); - } - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java 
b/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java deleted file mode 100644 index 795eb85..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import java.io.IOException; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class LanguageProfileTest { - - @Test - public void testLanguageProfile() throws IOException { - LanguageProfile foo = new LanguageProfile(); - assertEquals(0, foo.getCount("foo")); - - foo.add("foo"); - assertEquals(1, foo.getCount("foo")); - - foo.add("foo", 3); - assertEquals(4, foo.getCount("foo")); - - LanguageProfile bar = new LanguageProfile(); - assertEquals(1.0, foo.distance(bar), 1e-8); - - bar.add("bar"); - assertEquals(Math.sqrt(2.0), foo.distance(bar), 1e-8); - - bar.add("bar", 3); - assertEquals(Math.sqrt(2.0), foo.distance(bar), 1e-8); - - LanguageProfile foobar = new LanguageProfile(); - assertTrue(foo.distance(foobar) == bar.distance(foobar)); - - foobar.add("foo"); - assertTrue( foo.distance(foobar) < bar.distance(foobar)); - - foobar.add("bar"); - assertTrue(foo.distance(foobar) == bar.distance(foobar)); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java b/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java deleted file mode 100644 index 39ba686..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.language; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URISyntaxException; - -import org.apache.tika.exception.TikaException; -import org.junit.After; -import org.junit.Test; - -public class LanguageProfilerBuilderTest { - /* Test members */ - private LanguageProfilerBuilder ngramProfile = null; - private LanguageProfile langProfile = null; - private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/" - + LanguageProfilerBuilderTest.class.getName(); - private final String corpusName = "langbuilder/welsh_corpus.txt"; - private final String FILE_EXTENSION = "ngp"; - private final String LANGUAGE = "welsh"; - private final int maxlen = 1000; - - @Test - public void testCreateProfile() throws TikaException, IOException, URISyntaxException { - try (InputStream is = LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName)) { - ngramProfile = LanguageProfilerBuilder.create(profileName, is, UTF_8.name()); - } - - File f = new File(profileName + "." 
+ FILE_EXTENSION); - FileOutputStream fos = new FileOutputStream(f); - ngramProfile.save(fos); - fos.close(); - assertEquals(maxlen, ngramProfile.getSorted().size()); - } - - @Test - public void testNGramProfile() throws IOException, TikaException, URISyntaxException { - createLanguageProfile(); - LanguageIdentifier.addProfile(LANGUAGE, langProfile); - LanguageIdentifier identifier = new LanguageIdentifier(langProfile); - assertEquals(LANGUAGE, identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - } - - private void createLanguageProfile() throws IOException, TikaException, URISyntaxException { - // Sort of dependency injection - if (ngramProfile == null) - testCreateProfile(); - - langProfile = new LanguageProfile(); - - try (InputStream stream = new FileInputStream(new File(profileName + "." + FILE_EXTENSION))) { - BufferedReader reader = new BufferedReader(new InputStreamReader( - stream, UTF_8)); - String line = reader.readLine(); - while (line != null) { - if (line.length() > 0 && !line.startsWith("#")) {// skips the - // ngp - // header/comment - int space = line.indexOf(' '); - langProfile.add(line.substring(0, space), - Long.parseLong(line.substring(space + 1))); - } - line = reader.readLine(); - } - } - } - - @After - public void tearDown() throws Exception { - File profile = new File(profileName + "." 
+ FILE_EXTENSION); - if (profile.exists()) - profile.delete(); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java b/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java deleted file mode 100644 index 5ffcb0f..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import java.io.IOException; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -public class ProfilingWriterTest { - - @Test - public void testProfilingWriter() throws IOException { - ProfilingWriter writer = new ProfilingWriter(); - writer.write(" foo+BAR FooBar\n"); - writer.close(); - - LanguageProfile profile = writer.getProfile(); - assertEquals(2, profile.getCount("_fo")); - assertEquals(2, profile.getCount("foo")); - assertEquals(1, profile.getCount("oo_")); - assertEquals(1, profile.getCount("oob")); - assertEquals(1, profile.getCount("oba")); - assertEquals(1, profile.getCount("_ba")); - assertEquals(2, profile.getCount("bar")); - assertEquals(2, profile.getCount("ar_")); - } - -}
