Remove built-in lang detector and update all code to use the new language detector API.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a7a94ca Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a7a94ca Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a7a94ca Branch: refs/heads/master Commit: 3a7a94ca5040eabd90f6060effc517126def3fc1 Parents: e38512e Author: Ken Krugler <[email protected]> Authored: Thu Feb 11 11:54:39 2016 -0800 Committer: trevorlewis <[email protected]> Committed: Mon Mar 7 11:43:33 2016 -0800 ---------------------------------------------------------------------- .../main/java/org/apache/tika/cli/TikaCLI.java | 41 +- .../java/org/apache/tika/cli/TikaCLITest.java | 12 - .../tika/language/LanguageIdentifier.java | 240 -- .../apache/tika/language/LanguageProfile.java | 314 --- .../tika/language/LanguageProfilerBuilder.java | 770 ------ .../apache/tika/language/ProfilingHandler.java | 67 - .../apache/tika/language/ProfilingWriter.java | 103 - .../org/apache/tika/language/package-info.java | 22 - .../tika/language/LanguageIdentifierTest.java | 183 -- .../tika/language/LanguageProfileTest.java | 58 - .../language/LanguageProfilerBuilderTest.java | 100 - .../tika/language/ProfilingWriterTest.java | 44 - .../resources/org/apache/tika/language/da.test | 108 - .../resources/org/apache/tika/language/de.test | 104 - .../resources/org/apache/tika/language/el.test | 109 - .../resources/org/apache/tika/language/en.test | 105 - .../resources/org/apache/tika/language/es.test | 107 - .../resources/org/apache/tika/language/et.test | 17 - .../resources/org/apache/tika/language/fi.test | 106 - .../resources/org/apache/tika/language/fr.test | 105 - .../resources/org/apache/tika/language/it.test | 109 - .../tika/language/langbuilder/welsh_corpus.txt | 2602 ------------------ .../resources/org/apache/tika/language/lt.test | 32 - .../resources/org/apache/tika/language/nl.test | 105 - .../resources/org/apache/tika/language/pt.test | 105 - 
.../resources/org/apache/tika/language/sv.test | 108 - .../java/org/apache/tika/example/Language.java | 32 +- .../tika/example/LanguageDetectingParser.java | 15 +- .../tika/example/LanguageDetectorExample.java | 33 + .../tika/example/LanguageIdentifierExample.java | 27 - .../org/apache/tika/example/MyFirstTika.java | 14 +- .../example/LanguageDetectorExampleTest.java | 39 + .../example/LanguageIdentifierExampleTest.java | 37 - .../tika/langdetect/LanguageConfidence.java | 3 +- .../tika/langdetect/LanguageDetector.java | 7 +- .../apache/tika/langdetect/LanguageHandler.java | 14 + .../apache/tika/langdetect/LanguageResult.java | 7 + .../apache/tika/langdetect/LanguageWriter.java | 9 + .../tika/langdetect/OptimaizeLangDetector.java | 10 +- .../langdetect/OptimaizeLangDetectorTest.java | 4 +- .../tika/server/resource/LanguageResource.java | 27 +- .../tika/server/resource/MetadataResource.java | 9 +- .../resource/RecursiveMetadataResource.java | 7 +- .../tika/server/resource/TranslateResource.java | 23 +- .../language/translate/AbstractTranslator.java | 16 + .../language/translate/CachedTranslator.java | 20 +- .../language/translate/ExternalTranslator.java | 13 +- .../language/translate/GoogleTranslator.java | 20 +- .../language/translate/Lingo24Translator.java | 20 +- .../language/translate/MosesTranslator.java | 7 +- 50 files changed, 243 insertions(+), 5946 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 4458526..3efe0f7 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -18,11 +18,6 @@ package org.apache.tika.cli; import static 
java.nio.charset.StandardCharsets.UTF_8; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.TransformerConfigurationException; -import javax.xml.transform.sax.SAXTransformerFactory; -import javax.xml.transform.sax.TransformerHandler; -import javax.xml.transform.stream.StreamResult; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -57,6 +52,12 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; + import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; @@ -82,8 +83,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.fork.ForkParser; import org.apache.tika.gui.TikaGUI; import org.apache.tika.io.TikaInputStream; -import org.apache.tika.language.LanguageProfilerBuilder; -import org.apache.tika.language.ProfilingHandler; +import org.apache.tika.langdetect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadata; import org.apache.tika.metadata.serialization.JsonMetadataList; @@ -283,7 +283,7 @@ public class TikaCLI { OutputStream output, Metadata metadata) throws Exception { final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); - return new ProfilingHandler() { + return new LanguageHandler() { public void endDocument() { writer.println(getLanguage().getLanguage()); writer.flush(); @@ -305,22 +305,6 @@ public class TikaCLI { }; - /* Creates ngram profile */ - private final OutputType CREATE_PROFILE = new OutputType() { - @Override - public void process( - InputStream stream, OutputStream output, Metadata metadata) - throws Exception { 
- ngp = LanguageProfilerBuilder.create(profileName, stream, encoding); - FileOutputStream fos = new FileOutputStream(new File(profileName + ".ngp")); - ngp.save(fos);//saves ngram profile - fos.close(); - PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); - writer.println("ngram profile location:=" + new File(ngp.getName()).getCanonicalPath()); - writer.flush(); - } - }; - private ParseContext context; private Detector detector; @@ -335,8 +319,6 @@ public class TikaCLI { private boolean recursiveJSON = false; - private LanguageProfilerBuilder ngp = null; - /** * Output character encoding, or <code>null</code> for platform default */ @@ -355,8 +337,6 @@ public class TikaCLI { private boolean fork = false; - private String profileName = null; - private boolean prettyPrint; public TikaCLI() throws Exception { @@ -474,9 +454,6 @@ public class TikaCLI { } else if (arg.startsWith("--client=")) { URI uri = new URI(arg.substring("--client=".length())); parser = new NetworkParser(uri); - } else if(arg.startsWith("--create-profile=")){ - profileName = arg.substring("--create-profile=".length()); - type = CREATE_PROFILE; } else { pipeMode = false; if (serverMode) { @@ -586,8 +563,6 @@ public class TikaCLI { out.println(" -r or --pretty-print For JSON, XML and XHTML outputs, adds newlines and"); out.println(" whitespace, for better readability"); out.println(); - out.println(" --create-profile=X"); - out.println(" Create NGram profile, where X is a profile name"); out.println(" --list-parsers"); out.println(" List the available document parsers"); out.println(" --list-parser-details"); http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 9fc8ee8..4a68475 100644 --- 
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -54,18 +54,6 @@ public class TikaCLITest { } /** - * Creates a welsh language profile - * - * @throws Exception - */ - @Test - public void testCreateProfile() throws Exception { - String[] params = {"--create-profile=welsh", "-eUTF-8", resourcePrefix + "welsh_corpus.txt"}; - TikaCLI.main(params); - assertTrue(profile.exists()); - } - - /** * Tests --list-parser-detail option of the cli * * @throws Exception http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java b/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java deleted file mode 100644 index 00f6d06..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.HashMap; -import java.util.Map; -import java.util.Properties; -import java.util.Set; - -import static java.nio.charset.StandardCharsets.UTF_8; - -/** - * Identifier of the language that best matches a given content profile. - * The content profile is compared to generic language profiles based on - * material from various sources. - * - * @since Apache Tika 0.5 - * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/"> - * Europarl: A Parallel Corpus for Statistical Machine Translation</a> - * @see <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php"> - * ISO 639 Language Codes</a> - */ -public class LanguageIdentifier { - - /** - * The available language profiles. - */ - private static final Map<String, LanguageProfile> PROFILES = - new HashMap<String, LanguageProfile>(); - private static final String PROFILE_SUFFIX = ".ngp"; - - private static Properties props = new Properties(); - private static String errors = ""; - - private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties"; - private static final String PROPERTIES_FILE = "tika.language.properties"; - private static final String LANGUAGES_KEY = "languages"; - private static final double CERTAINTY_LIMIT = 0.022; - - private final String language; - - private final double distance; - - /* - * Always attempt initializing language profiles when class is loaded first time - */ - static { - initProfiles(); - } - - /* - * Add one language profile based on config in property file - */ - private static void addProfile(String language) throws Exception { - try { - LanguageProfile profile = new LanguageProfile(); - - try (InputStream stream = - LanguageIdentifier.class.getResourceAsStream( - language + PROFILE_SUFFIX)) { - BufferedReader reader = - new BufferedReader(new 
InputStreamReader(stream, UTF_8)); - String line = reader.readLine(); - while (line != null) { - if (line.length() > 0 && !line.startsWith("#")) { - int space = line.indexOf(' '); - profile.add( - line.substring(0, space), - Long.parseLong(line.substring(space + 1))); - } - line = reader.readLine(); - } - } - - addProfile(language, profile); - } catch (Throwable t) { - throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage()); - } - } - - /** - * Adds a single language profile - * @param language an ISO 639 code representing language - * @param profile the language profile - */ - public static void addProfile(String language, LanguageProfile profile) { - PROFILES.put(language, profile); - } - - /** - * Constructs a language identifier based on a LanguageProfile - * @param profile the language profile - */ - public LanguageIdentifier(LanguageProfile profile) { - String minLanguage = "unknown"; - double minDistance = 1.0; - for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) { - double distance = profile.distance(entry.getValue()); - if (distance < minDistance) { - minDistance = distance; - minLanguage = entry.getKey(); - } - } - - this.language = minLanguage; - this.distance = minDistance; - } - - /** - * Constructs a language identifier based on a String of text content - * @param content the text - */ - public LanguageIdentifier(String content) { - this(new LanguageProfile(content)); - } - - /** - * Gets the identified language - * @return an ISO 639 code representing the detected language - */ - public String getLanguage() { - return language; - } - - /** - * Tries to judge whether the identification is certain enough - * to be trusted. - * WARNING: Will never return true for small amount of input texts. 
- * @return <code>true</code> if the distance is smaller then {@value #CERTAINTY_LIMIT}, <code>false</code> otherwise - */ - public boolean isReasonablyCertain() { - return distance < CERTAINTY_LIMIT; - } - - /** - * Builds the language profiles. - * The list of languages are fetched from a property file named "tika.language.properties" - * If a file called "tika.language.override.properties" is found on classpath, this is used instead - * The property file contains a key "languages" with values being comma-separated language codes - */ - public static void initProfiles() { - clearProfiles(); - - errors = ""; - InputStream stream; - stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE); - if(stream == null) { - stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE); - } - - if(stream != null){ - try { - props = new Properties(); - props.load(stream); - } catch (IOException e) { - errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n"; - } - } - - String[] languages = props.getProperty(LANGUAGES_KEY).split(","); - for(String language : languages) { - language = language.trim(); - String name = props.getProperty("name."+language, "Unknown"); - try { - addProfile(language); - } catch (Exception e) { - errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n"; - } - } - } - - /** - * Initializes the language profiles from a user supplied initialized Map. 
- * This overrides the default set of profiles initialized at startup, - * and provides an alternative to configuring profiles through property file - * - * @param profilesMap map of language profiles - */ - public static void initProfiles(Map<String, LanguageProfile> profilesMap) { - clearProfiles(); - for(Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) { - addProfile(entry.getKey(), entry.getValue()); - } - } - - /** - * Clears the current map of language profiles - */ - public static void clearProfiles() { - PROFILES.clear(); - } - - /** - * Tests whether there were errors initializing language config - * @return true if there are errors. Use getErrors() to retrieve. - */ - public static boolean hasErrors() { - return errors != ""; - } - - /** - * Returns a string of error messages related to initializing langauge profiles - * @return the String containing the error messages - */ - public static String getErrors() { - return errors; - } - - /** - * Returns what languages are supported for language identification - * @return A set of Strings being the ISO 639 language codes - */ - public static Set<String> getSupportedLanguages() { - return PROFILES.keySet(); - } - - @Override - public String toString() { - return language + " (" + distance + ")"; - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java b/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java deleted file mode 100644 index 9442920..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; - -/** - * Language profile based on ngram counts. - * - * @since Apache Tika 0.5 - */ -public class LanguageProfile { - - public static final int DEFAULT_NGRAM_LENGTH = 3; - - private final int length; - - /** - * The ngrams that make up this profile. - */ - private final Map<String, Counter> ngrams = - new HashMap<String, Counter>(); - - /** - * Sorted ngram cache for faster distance calculation. - */ - private Interleaved interleaved = new Interleaved(); - public static boolean useInterleaved = true; // For testing purposes - - /** - * The sum of all ngram counts in this profile. - * Used to calculate relative ngram frequency. 
- */ - private long count = 0; - - private static class Counter { - private long count = 0; - public String toString() { - return Long.toString(count); - } - } - - public LanguageProfile(int length) { - this.length = length; - } - - public LanguageProfile() { - this(DEFAULT_NGRAM_LENGTH); - } - - public LanguageProfile(String content, int length) { - this(length); - - ProfilingWriter writer = new ProfilingWriter(this); - char[] ch = content.toCharArray(); - writer.write(ch, 0, ch.length); - } - - public LanguageProfile(String content) { - this(content, DEFAULT_NGRAM_LENGTH); - } - - public long getCount() { - return count; - } - - public long getCount(String ngram) { - Counter counter = ngrams.get(ngram); - if (counter != null) { - return counter.count; - } else { - return 0; - } - } - - /** - * Adds a single occurrence of the given ngram to this profile. - * - * @param ngram the ngram - */ - public void add(String ngram) { - add(ngram, 1); - } - - /** - * Adds multiple occurrences of the given ngram to this profile. - * - * @param ngram the ngram - * @param count number of occurrences to add - */ - public void add(String ngram, long count) { - if (length != ngram.length()) { - throw new IllegalArgumentException( - "Unable to add an ngram of incorrect length: " - + ngram.length() + " != " + length); - } - - Counter counter = ngrams.get(ngram); - if (counter == null) { - counter = new Counter(); - ngrams.put(ngram, counter); - } - counter.count += count; - this.count += count; - } - - /** - * Calculates the geometric distance between this and the given - * other language profile. - * - * @param that the other language profile - * @return distance between the profiles - */ - public double distance(LanguageProfile that) { - return useInterleaved ? 
distanceInterleaved(that) : distanceStandard(that); - } - - private double distanceStandard(LanguageProfile that) { - if (length != that.length) { - throw new IllegalArgumentException( - "Unable to calculage distance of language profiles" - + " with different ngram lengths: " - + that.length + " != " + length); - } - - double sumOfSquares = 0.0; - double thisCount = Math.max(this.count, 1.0); - double thatCount = Math.max(that.count, 1.0); - - Set<String> ngrams = new HashSet<String>(); - ngrams.addAll(this.ngrams.keySet()); - ngrams.addAll(that.ngrams.keySet()); - for (String ngram : ngrams) { - double thisFrequency = this.getCount(ngram) / thisCount; - double thatFrequency = that.getCount(ngram) / thatCount; - double difference = thisFrequency - thatFrequency; - sumOfSquares += difference * difference; - } - - return Math.sqrt(sumOfSquares); - } - - @Override - public String toString() { - return ngrams.toString(); - } - - /* Code for interleaved distance calculation below */ - - private double distanceInterleaved(LanguageProfile that) { - if (length != that.length) { - throw new IllegalArgumentException( - "Unable to calculage distance of language profiles" - + " with different ngram lengths: " - + that.length + " != " + length); - } - - double sumOfSquares = 0.0; - double thisCount = Math.max(this.count, 1.0); - double thatCount = Math.max(that.count, 1.0); - - Interleaved.Entry thisEntry = updateInterleaved().firstEntry(); - Interleaved.Entry thatEntry = that.updateInterleaved().firstEntry(); - - // Iterate the lists in parallel, until both lists has been depleted - while (thisEntry.hasNgram() || thatEntry.hasNgram()) { - if (!thisEntry.hasNgram()) { // Depleted this - sumOfSquares += square(thatEntry.count / thatCount); - thatEntry.next(); - continue; - } - - if (!thatEntry.hasNgram()) { // Depleted that - sumOfSquares += square(thisEntry.count / thisCount); - thisEntry.next(); - continue; - } - - final int compare = thisEntry.compareTo(thatEntry); - - if 
(compare == 0) { // Term exists both in this and that - double difference = thisEntry.count/thisCount - thatEntry.count/thatCount; - sumOfSquares += square(difference); - thisEntry.next(); - thatEntry.next(); - } else if (compare < 0) { // Term exists only in this - sumOfSquares += square(thisEntry.count/thisCount); - thisEntry.next(); - } else { // Term exists only in that - sumOfSquares += square(thatEntry.count/thatCount); - thatEntry.next(); - } - } - return Math.sqrt(sumOfSquares); - } - private double square(double count) { - return count * count; - } - - private class Interleaved { - - private char[] entries = null; // <ngram(length chars)><count(2 chars)>* - private int size = 0; // Number of entries (one entry = length+2 chars) - private long entriesGeneratedAtCount = -1; // Keeps track of when the sequential structure was current - - /** - * Ensure that the entries array is in sync with the ngrams. - */ - public void update() { - if (count == entriesGeneratedAtCount) { // Already up to date - return; - } - size = ngrams.size(); - final int numChars = (length+2)*size; - if (entries == null || entries.length < numChars) { - entries = new char[numChars]; - } - int pos = 0; - for (Map.Entry<String, Counter> entry: getSortedNgrams()) { - for (int l = 0 ; l < length ; l++) { - entries[pos + l] = entry.getKey().charAt(l); - } - entries[pos + length] = (char)(entry.getValue().count / 65536); // Upper 16 bit - entries[pos + length + 1] = (char)(entry.getValue().count % 65536); // lower 16 bit - pos += length + 2; - } - entriesGeneratedAtCount = count; - } - - public Entry firstEntry() { - Entry entry = new Entry(); - if (size > 0) { - entry.update(0); - } - return entry; - } - - private List<Map.Entry<String, Counter>> getSortedNgrams() { - List<Map.Entry<String, Counter>> entries = new ArrayList<Map.Entry<String, Counter>>(ngrams.size()); - entries.addAll(ngrams.entrySet()); - Collections.sort(entries, new Comparator<Map.Entry<String, Counter>>() { - @Override - 
public int compare(Map.Entry<String, Counter> o1, Map.Entry<String, Counter> o2) { - return o1.getKey().compareTo(o2.getKey()); - } - }); - return entries; - } - - private class Entry implements Comparable<Entry> { - char[] ngram = new char[length]; - int count = 0; - int pos = 0; - - private void update(int pos) { - this.pos = pos; - if (pos >= size) { // Reached the end - return; - } - final int origo = pos*(length+2); - System.arraycopy(entries, origo, ngram, 0, length); - count = entries[origo+length] * 65536 + entries[origo+length+1]; - } - - @Override - public int compareTo(Entry other) { - for (int i = 0 ; i < ngram.length ; i++) { - if (ngram[i] != other.ngram[i]) { - return ngram[i] - other.ngram[i]; - } - } - return 0; - } - public boolean hasNext() { - return pos < size-1; - } - public boolean hasNgram() { - return pos < size; - } - public void next() { - update(pos+1); - } - public String toString() { - return new String(ngram) + "(" + count + ")"; - } - } - } - private Interleaved updateInterleaved() { - interleaved.update(); - return interleaved; - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java b/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java deleted file mode 100644 index bac1f97..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java +++ /dev/null @@ -1,770 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - -// JDK imports -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import org.apache.tika.exception.TikaException; - -import static java.nio.charset.StandardCharsets.UTF_8; - -/** - * This class runs a ngram analysis over submitted text, results might be used - * for automatic language identification. - * - * The similarity calculation is at experimental level. You have been warned. - * - * Methods are provided to build new NGramProfiles profiles. - * - * @author Sami Siren - * @author Jerome Charron - http://frutch.free.fr/ - */ -public class LanguageProfilerBuilder { - - // public static final Log LOG = - // LogFactory.getLog(LanguageProfilerBuilder.class); - - /** The minimum length allowed for a ngram. */ - final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */ - - /** The maximum length allowed for a ngram. 
*/ - final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */ - - /** The default min length of ngram */ - final static int DEFAULT_MIN_NGRAM_LENGTH = 3; - - /** The default max length of ngram */ - final static int DEFAULT_MAX_NGRAM_LENGTH = 3; - - /** The ngram profile file extension */ - final static String FILE_EXTENSION = "ngp"; - - /** The profile max size (number of ngrams of the same size) */ - final static int MAX_SIZE = 1000; - - /** separator char */ - final static char SEPARATOR = '_'; - /** The String form of the separator char */ - private final static String SEP_CHARSEQ = new String( - new char[] { SEPARATOR }); - - /** The profile's name */ - private String name = null; - - /** The NGrams of this profile sorted on the number of occurrences */ - private List<NGramEntry> sorted = null; - - /** The min length of ngram */ - private int minLength = DEFAULT_MIN_NGRAM_LENGTH; - - /** The max length of ngram */ - private int maxLength = DEFAULT_MAX_NGRAM_LENGTH; - - /** The total number of ngrams occurences */ - private int[] ngramcounts = null; - - /** An index of the ngrams of the profile */ - private Map<CharSequence, NGramEntry> ngrams = null; - - /** A StringBuffer used during analysis */ - private QuickStringBuffer word = new QuickStringBuffer(); - - /** - * Constructs a new ngram profile - * - * @param name is the name of the profile - * @param minlen is the min length of ngram sequences - * @param maxlen is the max length of ngram sequences - */ - public LanguageProfilerBuilder(String name, int minlen, int maxlen) { - // TODO: Compute the initial capacity using minlen and maxlen. 
- this.ngrams = new HashMap<CharSequence, NGramEntry>(4000); - this.minLength = minlen; - this.maxLength = maxlen; - this.name = name; - } - - /** - * Constructs a new ngram profile where minlen=3, maxlen=3 - * - * @param name is a name of profile, usually two length string - * @since Tika 1.0 - */ - public LanguageProfilerBuilder(String name) { - this.ngrams = new HashMap<CharSequence, NGramEntry>(4000); - this.minLength = ABSOLUTE_MIN_NGRAM_LENGTH; - this.maxLength = ABSOLUTE_MAX_NGRAM_LENGTH; - this.name = name; - } - - /** - * @return Returns the name. - */ - public String getName() { - return name; - } - - // This method was commented because it depends on org.apache.lucene.analysis.Token - // that is not a part of the Tika - // /** - // * Adds ngrams from a token to this profile - // * - // * @param t is the Token to be added - // */ - // public void add(Token t) { - // add(new StringBuffer().append(SEPARATOR) - // .append(t.term()) - // .append(SEPARATOR)); - // } - - /** - * Adds ngrams from a single word to this profile - * - * @param word is the word to add - */ - public void add(StringBuffer word) { - for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) { - add(word, i); - } - } - - /** - * Adds the last NGrams from the specified word. 
- */ - private void add(QuickStringBuffer word) { - int wlen = word.length(); - if (wlen >= minLength) { - int max = Math.min(maxLength, wlen); - for (int i = minLength; i <= max; i++) { - add(word.subSequence(wlen - i, wlen)); - } - } - } - - /** - * Adds ngrams from a single word in this profile - * - * @param word is the word to add - * @param n is the ngram size - */ - private void add(CharSequence cs) { - - if (cs.equals(SEP_CHARSEQ)) { - return; - } - NGramEntry nge = ngrams.get(cs); - if (nge == null) { - nge = new NGramEntry(cs); - ngrams.put(cs, nge); - } - nge.inc(); - } - - /** - * Analyzes a piece of text - * - * @param text - * the text to be analyzed - */ - public void analyze(StringBuilder text) { - - if (ngrams != null) { - ngrams.clear(); - sorted = null; - ngramcounts = null; - } - - word.clear().append(SEPARATOR); - for (int i = 0; i < text.length(); i++) { - char c = Character.toLowerCase(text.charAt(i)); - - if (Character.isLetter(c)) { - add(word.append(c)); - } else { - // found word boundary - if (word.length() > 1) { - // we have a word! - add(word.append(SEPARATOR)); - word.clear().append(SEPARATOR); - } - } - } - - if (word.length() > 1) { - // we have a word! 
- add(word.append(SEPARATOR)); - } - normalize(); - } - - /** - * @param word - * @param n sequence length - */ - private void add(StringBuffer word, int n) { - for (int i = 0; i <= word.length() - n; i++) { - add(word.subSequence(i, i + n)); - } - } - - /** - * Normalizes the profile (calculates the ngrams frequencies) - */ - protected void normalize() { - NGramEntry e = null; - Iterator<NGramEntry> i = ngrams.values().iterator(); - - // Calculates ngram count if not already done - if (ngramcounts == null) { - ngramcounts = new int[maxLength + 1]; - while (i.hasNext()) { - e = i.next(); - ngramcounts[e.size()] += e.count; - } - } - - i = ngrams.values().iterator(); - while (i.hasNext()) { - e = i.next(); - e.frequency = (float) e.count / (float) ngramcounts[e.size()]; - } - } - - /** - * Returns a sorted list of ngrams (sort done by 1. frequency 2. sequence) - * - * @return sorted vector of ngrams - */ - public List<NGramEntry> getSorted() { - // make sure sorting is done only once - if (sorted == null) { - sorted = new ArrayList<NGramEntry>(ngrams.values()); - Collections.sort(sorted); - - // trim at NGRAM_LENGTH entries - if (sorted.size() > MAX_SIZE) { - sorted = sorted.subList(0, MAX_SIZE); - } - } - return sorted; - } - - // Inherited JavaDoc - public String toString() { - - StringBuffer s = new StringBuffer().append("NGramProfile: ") - .append(name).append("\n"); - - Iterator<NGramEntry> i = getSorted().iterator(); - - while (i.hasNext()) { - NGramEntry entry = i.next(); - s.append("[").append(entry.seq).append("/").append(entry.count) - .append("/").append(entry.frequency).append("]\n"); - } - return s.toString(); - } - - /** - * Calculates a score how well NGramProfiles match each other - * - * @param another - * ngram profile to compare against - * @return similarity 0=exact match - * @throws TikaException - * if could not calculate a score - */ - public float getSimilarity(LanguageProfilerBuilder another) - throws TikaException { - - float sum = 0; - - 
try { - Iterator<NGramEntry> i = another.getSorted().iterator(); - while (i.hasNext()) { - NGramEntry other = i.next(); - if (ngrams.containsKey(other.seq)) { - sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2; - } else { - sum += other.frequency; - } - } - i = getSorted().iterator(); - while (i.hasNext()) { - NGramEntry other = i.next(); - if (another.ngrams.containsKey(other.seq)) { - sum += Math.abs((other.frequency - another.ngrams - .get(other.seq).frequency)) / 2; - } else { - sum += other.frequency; - } - } - } catch (Exception e) { - throw new TikaException("Could not calculate a score how well NGramProfiles match each other"); - } - return sum; - } - - /** - * Loads a ngram profile from an InputStream (assumes UTF-8 encoded content) - * - * @param is the InputStream to read - */ - public void load(InputStream is) throws IOException { - - ngrams.clear(); - ngramcounts = new int[maxLength + 1]; - BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8)); - String line = null; - - while ((line = reader.readLine()) != null) { - - // # starts a comment line - if (line.charAt(0) != '#') { - int spacepos = line.indexOf(' '); - String ngramsequence = line.substring(0, spacepos).trim(); - int len = ngramsequence.length(); - if ((len >= minLength) && (len <= maxLength)) { - int ngramcount = Integer.parseInt(line.substring(spacepos + 1)); - NGramEntry en = new NGramEntry(ngramsequence, ngramcount); - ngrams.put(en.getSeq(), en); - ngramcounts[len] += ngramcount; - } - } - } - normalize(); - } - - /** - * Creates a new Language profile from (preferably quite large - 5-10k of - * lines) text file - * - * @param name to be given for the profile - * @param is a stream to be read - * @param encoding is the encoding of stream - * - * @throws TikaException if could not create a language profile - * - */ - public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException { - - 
LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name, - ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH); - BufferedInputStream bis = new BufferedInputStream(is); - - byte buffer[] = new byte[4096]; - StringBuilder text = new StringBuilder(); - int len; - - try { - while ((len = bis.read(buffer)) != -1) { - text.append(new String(buffer, 0, len, encoding)); - } - } catch (IOException e) { - throw new TikaException("Could not create profile, " + e.getMessage()); - } - - newProfile.analyze(text); - return newProfile; - } - - /** - * Writes NGramProfile content into OutputStream, content is outputted with - * UTF-8 encoding - * - * @param os the Stream to output to - * - * @throws IOException - */ - public void save(OutputStream os) throws IOException { - os.write(("# NgramProfile generated at " + new Date() + - " for Apache Tika Language Identification\n").getBytes(UTF_8)); - - // And then each ngram - - // First dispatch ngrams in many lists depending on their size - // (one list for each size, in order to store MAX_SIZE ngrams for each - // size of ngram) - List<NGramEntry> list = new ArrayList<NGramEntry>(); - List<NGramEntry> sublist = new ArrayList<NGramEntry>(); - NGramEntry[] entries = ngrams.values().toArray( - new NGramEntry[ngrams.size()]); - for (int i = minLength; i <= maxLength; i++) { - for (int j = 0; j < entries.length; j++) { - if (entries[j].getSeq().length() == i) { - sublist.add(entries[j]); - } - } - Collections.sort(sublist); - if (sublist.size() > MAX_SIZE) { - sublist = sublist.subList(0, MAX_SIZE); - } - list.addAll(sublist); - sublist.clear(); - } - for (int i = 0; i < list.size(); i++) { - NGramEntry e = list.get(i); - String line = e.toString() + " " + e.getCount() + "\n"; - os.write(line.getBytes(UTF_8)); - } - os.flush(); - } - - /** - * main method used for testing only - * - * @param args - */ - public static void main(String args[]) { - - // -create he sample_he.txt utf-8 - - String usage = "Usage: NGramProfile " 
- + "[-create profilename filename encoding] " - + "[-similarity file1 file2] " - + "[-score profile-name filename encoding]"; - int command = 0; - - final int CREATE = 1; - final int SIMILARITY = 2; - final int SCORE = 3; - - String profilename = ""; - String filename = ""; - String filename2 = ""; - String encoding = ""; - - if (args.length == 0) { - System.err.println(usage); - System.exit(-1); - } - - for (int i = 0; i < args.length; i++) { // parse command line - if (args[i].equals("-create")) { // found -create option - command = CREATE; - profilename = args[++i]; - filename = args[++i]; - encoding = args[++i]; - } - - if (args[i].equals("-similarity")) { // found -similarity option - command = SIMILARITY; - filename = args[++i]; - filename2 = args[++i]; - encoding = args[++i]; - } - - if (args[i].equals("-score")) { // found -Score option - command = SCORE; - profilename = args[++i]; - filename = args[++i]; - encoding = args[++i]; - } - } - - try { - - switch (command) { - - case CREATE: - - File f = new File(filename); - FileInputStream fis = new FileInputStream(f); - LanguageProfilerBuilder newProfile = LanguageProfilerBuilder - .create(profilename, fis, encoding); - fis.close(); - f = new File(profilename + "." + FILE_EXTENSION); - FileOutputStream fos = new FileOutputStream(f); - newProfile.save(fos); - System.out.println("new profile " + profilename + "." 
- + FILE_EXTENSION + " was created."); - break; - - case SIMILARITY: - - f = new File(filename); - fis = new FileInputStream(f); - newProfile = LanguageProfilerBuilder.create(filename, fis, - encoding); - newProfile.normalize(); - - f = new File(filename2); - fis = new FileInputStream(f); - LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder - .create(filename2, fis, encoding); - newProfile2.normalize(); - System.out.println("Similarity is " - + newProfile.getSimilarity(newProfile2)); - break; - - case SCORE: - f = new File(filename); - fis = new FileInputStream(f); - newProfile = LanguageProfilerBuilder.create(filename, fis, - encoding); - - f = new File(profilename + "." + FILE_EXTENSION); - fis = new FileInputStream(f); - LanguageProfilerBuilder compare = new LanguageProfilerBuilder( - profilename, DEFAULT_MIN_NGRAM_LENGTH, - DEFAULT_MAX_NGRAM_LENGTH); - compare.load(fis); - System.out.println("Score is " - + compare.getSimilarity(newProfile)); - break; - - } - - } catch (Exception e) { - e.printStackTrace(); - // throw new TikaException(""); - } - } - - - /** - * Inner class that describes a NGram - */ - static class NGramEntry implements Comparable<NGramEntry> { - - /** The NGRamProfile this NGram is related to */ - private LanguageProfilerBuilder profile = null; - - /** The sequence of characters of the ngram */ - CharSequence seq = null; - - /** The number of occurences of this ngram in its profile */ - private int count = 0; - - /** The frequency of this ngram in its profile */ - private float frequency = 0.0F; - - /** - * Constructs a new NGramEntry - * - * @param seq is the sequence of characters of the ngram - */ - public NGramEntry(CharSequence seq) { - this.seq = seq; - } - - /** - * Constructs a new NGramEntry - * - * @param seq is the sequence of characters of the ngram - * @param count is the number of occurrences of this ngram - */ - public NGramEntry(String seq, int count) { - this.seq = new StringBuffer(seq).subSequence(0, 
seq.length()); - this.count = count; - } - - /** - * Returns the number of occurrences of this ngram in its profile - * - * @return the number of occurrences of this ngram in its profile - */ - public int getCount() { - return count; - } - - /** - * Returns the frequency of this ngram in its profile - * - * @return the frequency of this ngram in its profile - */ - public float getFrequency() { - return frequency; - } - - /** - * Returns the sequence of characters of this ngram - * - * @return the sequence of characters of this ngram - */ - public CharSequence getSeq() { - return seq; - } - - /** - * Returns the size of this ngram - * - * @return the size of this ngram - */ - public int size() { - return seq.length(); - } - - // Inherited JavaDoc - public int compareTo(NGramEntry ngram) { - int diff = Float.compare(ngram.getFrequency(), frequency); - if (diff != 0) { - return diff; - } else { - return (toString().compareTo(ngram.toString())); - } - } - - /** - * Increments the number of occurrences of this ngram. 
- */ - public void inc() { - count++; - } - - /** - * Associated a profile to this ngram - * - * @param profile - * is the profile associated to this ngram - */ - public void setProfile(LanguageProfilerBuilder profile) { - this.profile = profile; - } - - /** - * Returns the profile associated to this ngram - * - * @return the profile associated to this ngram - */ - public LanguageProfilerBuilder getProfile() { - return profile; - } - - // Inherited JavaDoc - public String toString() { - return seq.toString(); - } - - // Inherited JavaDoc - public int hashCode() { - return seq.hashCode(); - } - - // Inherited JavaDoc - public boolean equals(Object obj) { - - NGramEntry ngram = null; - try { - ngram = (NGramEntry) obj; - return ngram.seq.equals(seq); - } catch (Exception e) { - return false; - } - } - - } - - private static class QuickStringBuffer implements CharSequence { - - private char value[]; - - private int count; - - QuickStringBuffer() { - this(16); - } - - QuickStringBuffer(char[] value) { - this.value = value; - count = value.length; - } - - QuickStringBuffer(int length) { - value = new char[length]; - } - - QuickStringBuffer(String str) { - this(str.length() + 16); - append(str); - } - - public int length() { - return count; - } - - private void expandCapacity(int minimumCapacity) { - int newCapacity = (value.length + 1) * 2; - if (newCapacity < 0) { - newCapacity = Integer.MAX_VALUE; - } else if (minimumCapacity > newCapacity) { - newCapacity = minimumCapacity; - } - - char newValue[] = new char[newCapacity]; - System.arraycopy(value, 0, newValue, 0, count); - value = newValue; - } - - QuickStringBuffer clear() { - count = 0; - return this; - } - - public char charAt(int index) { - return value[index]; - } - - QuickStringBuffer append(String str) { - if (str == null) { - str = String.valueOf(str); - } - - int len = str.length(); - int newcount = count + len; - if (newcount > value.length) { - expandCapacity(newcount); - } - str.getChars(0, len, value, 
count); - count = newcount; - return this; - } - - QuickStringBuffer append(char c) { - int newcount = count + 1; - if (newcount > value.length) { - expandCapacity(newcount); - } - value[count++] = c; - return this; - } - - public CharSequence subSequence(int start, int end) { - return new String(value, start, end - start); - } - - public String toString() { - return new String(this.value); - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java b/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java deleted file mode 100644 index 5c5b9d2..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - -import org.apache.tika.sax.WriteOutContentHandler; - -/** - * SAX content handler that builds a language profile based on all the - * received character content. 
- * - * @since Apache Tika 0.5 - */ -public class ProfilingHandler extends WriteOutContentHandler { - - private final ProfilingWriter writer; - - public ProfilingHandler(ProfilingWriter writer) { - super(writer); - this.writer = writer; - } - - public ProfilingHandler(LanguageProfile profile) { - this(new ProfilingWriter(profile)); - } - - public ProfilingHandler() { - this(new ProfilingWriter()); - } - - /** - * Returns the language profile being built by this content handler. - * Note that the returned profile gets updated whenever new SAX events - * are received by this content handler. Use the {@link #getLanguage()} - * method to get the language that best matches the current state of - * the profile. - * - * @return language profile - */ - public LanguageProfile getProfile() { - return writer.getProfile(); - } - - /** - * Returns the language that best matches the current state of the - * language profile. - * - * @return language that best matches the current profile - */ - public LanguageIdentifier getLanguage() { - return writer.getLanguage(); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java b/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java deleted file mode 100644 index 33ce707..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.language; - -import java.io.IOException; -import java.io.Writer; - -/** - * Writer that builds a language profile based on all the written content. - * - * @since Apache Tika 0.5 - */ -public class ProfilingWriter extends Writer { - - private final LanguageProfile profile; - - private char[] buffer = new char[] { 0, 0, '_' }; - - private int n = 1; - - public ProfilingWriter(LanguageProfile profile) { - this.profile = profile; - } - - public ProfilingWriter() { - this(new LanguageProfile()); - } - - /** - * Returns the language profile being built by this writer. Note that - * the returned profile gets updated whenever new characters are written. - * Use the {@link #getLanguage()} method to get the language that best - * matches the current state of the profile. - * - * @return language profile - */ - public LanguageProfile getProfile() { - return profile; - } - - /** - * Returns the language that best matches the current state of the - * language profile. 
- * - * @return language that best matches the current profile - */ - public LanguageIdentifier getLanguage() { - return new LanguageIdentifier(profile); - } - - @Override - public void write(char[] cbuf, int off, int len) { - for (int i = 0; i < len; i++) { - char c = Character.toLowerCase(cbuf[off + i]); - if (Character.isLetter(c)) { - addLetter(c); - } else { - addSeparator(); - } - } - } - - private void addLetter(char c) { - System.arraycopy(buffer, 1, buffer, 0, buffer.length - 1); - buffer[buffer.length - 1] = c; - n++; - if (n >= buffer.length) { - profile.add(new String(buffer)); - } - } - - private void addSeparator() { - addLetter('_'); - n = 1; - } - - @Override - public void close() throws IOException { - addSeparator(); - } - - /** - * Ignored. - */ - @Override - public void flush() { - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/main/java/org/apache/tika/language/package-info.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/language/package-info.java b/tika-core/src/main/java/org/apache/tika/language/package-info.java deleted file mode 100644 index f8dc4bf..0000000 --- a/tika-core/src/main/java/org/apache/tika/language/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Language detection. - */ [email protected]("1.0.0") -package org.apache.tika.language; http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java b/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java deleted file mode 100644 index 0c5834b..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Writer; -import java.util.HashMap; -import java.util.Locale; - -import org.apache.tika.io.IOUtils; -import org.junit.Before; -import org.junit.Test; - -/** - * JUnit based test of class {@link LanguageIdentifier}. - * - * @author Sami Siren - * @author Jerome Charron - http://frutch.free.fr/ - */ -public class LanguageIdentifierTest { - - private static final String[] languages = new String[] { - // TODO - currently Estonian and Greek fail these tests. - // Enable when language detection works better. - "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", - "lt", "nl", "pt", "sv" - }; - - @Before - public void setUp() { - LanguageIdentifier.initProfiles(); - } - - @Test - public void testLanguageDetection() throws IOException { - for (String language : languages) { - ProfilingWriter writer = new ProfilingWriter(); - writeTo(language, writer); - LanguageIdentifier identifier = null; - identifier = new LanguageIdentifier(writer.getProfile()); - assertEquals(language, identifier.getLanguage()); - // Lithuanian is detected but isn't reasonably certain: - if (!language.equals("lt")) { - assertTrue(identifier.toString(), identifier.isReasonablyCertain()); - } - } - } - - @Test - public void testClearAddAndInitProfiles() throws IOException { - // Prepare english and german language profiles - ProfilingWriter enWriter = new ProfilingWriter(); - writeTo("en", enWriter); - LanguageProfile enProfile = enWriter.getProfile(); - ProfilingWriter deWriter = new ProfilingWriter(); - writeTo("de", deWriter); - LanguageProfile deProfile = deWriter.getProfile(); - - // Out of the box profiles - LanguageIdentifier identifier = null; - 
identifier = new LanguageIdentifier(enProfile); - assertEquals("en", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - - // No profiles - LanguageIdentifier.clearProfiles(); - identifier = new LanguageIdentifier(enProfile); - assertFalse(identifier.isReasonablyCertain()); - - // Only English profile - LanguageIdentifier.addProfile("en", enProfile); - identifier = new LanguageIdentifier(enProfile); - assertEquals("en", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - - // English and German profiles loaded explicitly from initProfiles method - HashMap<String, LanguageProfile> profilesMap = new HashMap<String, LanguageProfile>(); - profilesMap.put("en", enProfile); - profilesMap.put("de", deProfile); - LanguageIdentifier.initProfiles(profilesMap); - identifier = new LanguageIdentifier(enProfile); - assertEquals("en", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - identifier = new LanguageIdentifier(deProfile); - assertEquals("de", identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - } - - // Enable this to compare performance - public void testPerformance() throws IOException { - final int MRUNS = 8; - final int IRUNS = 10; - int detected = 0; // To avoid code removal by JVM or compiler - String lastResult = null; - for (int m = 0 ; m < MRUNS ; m++) { - LanguageProfile.useInterleaved = (m & 1) == 1; // Alternate between standard and interleaved - String currentResult = ""; - final long start = System.nanoTime(); - for (int i = 0 ; i < IRUNS ; i++) { - for (String language : languages) { - ProfilingWriter writer = new ProfilingWriter(); - writeTo(language, writer); - LanguageIdentifier identifier = new LanguageIdentifier(writer.getProfile()); - if (identifier.isReasonablyCertain()) { - currentResult += identifier.getLanguage(); - detected++; - } - } - } - System.out.println(String.format(Locale.ROOT, - "Performed %d detections at %2d ms/test with 
interleaved=%b", - languages.length*IRUNS, (System.nanoTime()-start)/1000000/(languages.length*IRUNS), - LanguageProfile.useInterleaved)); - if (lastResult != null) { // Might as well test that they behave the same while we're at it - assertEquals("This result should be equal to the last", lastResult, currentResult); - } - lastResult = currentResult; - } - if (detected == -1) { - System.out.println("Never encountered but keep it to guard against over-eager optimization"); - } - } - - @Test - public void testMixedLanguages() throws IOException { - for (String language : languages) { - for (String other : languages) { - if (!language.equals(other)) { - if (language.equals("lt") || other.equals("lt")) { - continue; - } - ProfilingWriter writer = new ProfilingWriter(); - writeTo(language, writer); - writeTo(other, writer); - LanguageIdentifier identifier = null; - identifier = new LanguageIdentifier(writer.getProfile()); - assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + identifier, identifier.isReasonablyCertain()); - } - } - } - } - - // TIKA-453: Fix up language identifier used for Estonian - @Test - public void testEstonia() throws Exception { - final String estonian = "et"; - ProfilingWriter writer = new ProfilingWriter(); - writeTo(estonian, writer); - LanguageIdentifier identifier = - new LanguageIdentifier(writer.getProfile()); - assertEquals(estonian, identifier.getLanguage()); - } - - private void writeTo(String language, Writer writer) throws IOException { - try (InputStream stream = - LanguageIdentifierTest.class.getResourceAsStream( - language + ".test")) { - IOUtils.copy(new InputStreamReader(stream, UTF_8), writer); - } - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java 
b/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java deleted file mode 100644 index 795eb85..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/LanguageProfileTest.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import java.io.IOException; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class LanguageProfileTest { - - @Test - public void testLanguageProfile() throws IOException { - LanguageProfile foo = new LanguageProfile(); - assertEquals(0, foo.getCount("foo")); - - foo.add("foo"); - assertEquals(1, foo.getCount("foo")); - - foo.add("foo", 3); - assertEquals(4, foo.getCount("foo")); - - LanguageProfile bar = new LanguageProfile(); - assertEquals(1.0, foo.distance(bar), 1e-8); - - bar.add("bar"); - assertEquals(Math.sqrt(2.0), foo.distance(bar), 1e-8); - - bar.add("bar", 3); - assertEquals(Math.sqrt(2.0), foo.distance(bar), 1e-8); - - LanguageProfile foobar = new LanguageProfile(); - assertTrue(foo.distance(foobar) == bar.distance(foobar)); - - foobar.add("foo"); - assertTrue( foo.distance(foobar) < bar.distance(foobar)); - - foobar.add("bar"); - assertTrue(foo.distance(foobar) == bar.distance(foobar)); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java b/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java deleted file mode 100644 index 39ba686..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.language; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URISyntaxException; - -import org.apache.tika.exception.TikaException; -import org.junit.After; -import org.junit.Test; - -public class LanguageProfilerBuilderTest { - /* Test members */ - private LanguageProfilerBuilder ngramProfile = null; - private LanguageProfile langProfile = null; - private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/" - + LanguageProfilerBuilderTest.class.getName(); - private final String corpusName = "langbuilder/welsh_corpus.txt"; - private final String FILE_EXTENSION = "ngp"; - private final String LANGUAGE = "welsh"; - private final int maxlen = 1000; - - @Test - public void testCreateProfile() throws TikaException, IOException, URISyntaxException { - try (InputStream is = LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName)) { - ngramProfile = LanguageProfilerBuilder.create(profileName, is, UTF_8.name()); - } - - File f = new File(profileName + "." 
+ FILE_EXTENSION); - FileOutputStream fos = new FileOutputStream(f); - ngramProfile.save(fos); - fos.close(); - assertEquals(maxlen, ngramProfile.getSorted().size()); - } - - @Test - public void testNGramProfile() throws IOException, TikaException, URISyntaxException { - createLanguageProfile(); - LanguageIdentifier.addProfile(LANGUAGE, langProfile); - LanguageIdentifier identifier = new LanguageIdentifier(langProfile); - assertEquals(LANGUAGE, identifier.getLanguage()); - assertTrue(identifier.isReasonablyCertain()); - } - - private void createLanguageProfile() throws IOException, TikaException, URISyntaxException { - // Sort of dependency injection - if (ngramProfile == null) - testCreateProfile(); - - langProfile = new LanguageProfile(); - - try (InputStream stream = new FileInputStream(new File(profileName + "." + FILE_EXTENSION))) { - BufferedReader reader = new BufferedReader(new InputStreamReader( - stream, UTF_8)); - String line = reader.readLine(); - while (line != null) { - if (line.length() > 0 && !line.startsWith("#")) {// skips the - // ngp - // header/comment - int space = line.indexOf(' '); - langProfile.add(line.substring(0, space), - Long.parseLong(line.substring(space + 1))); - } - line = reader.readLine(); - } - } - } - - @After - public void tearDown() throws Exception { - File profile = new File(profileName + "." 
+ FILE_EXTENSION); - if (profile.exists()) - profile.delete(); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/3a7a94ca/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java b/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java deleted file mode 100644 index 5ffcb0f..0000000 --- a/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.language; - -import java.io.IOException; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -public class ProfilingWriterTest { - - @Test - public void testProfilingWriter() throws IOException { - ProfilingWriter writer = new ProfilingWriter(); - writer.write(" foo+BAR FooBar\n"); - writer.close(); - - LanguageProfile profile = writer.getProfile(); - assertEquals(2, profile.getCount("_fo")); - assertEquals(2, profile.getCount("foo")); - assertEquals(1, profile.getCount("oo_")); - assertEquals(1, profile.getCount("oob")); - assertEquals(1, profile.getCount("oba")); - assertEquals(1, profile.getCount("_ba")); - assertEquals(2, profile.getCount("bar")); - assertEquals(2, profile.getCount("ar_")); - } - -}
