Author: mattmann
Date: Sat Feb 14 07:49:54 2015
New Revision: 1659759
URL: http://svn.apache.org/r1659759
Log:
Fix for TIKA-1549 Increased the speed of language identification by a factor of
two. Fix contributed by Toke Eskildsen <[email protected]>. This closes #29.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1659759&r1=1659758&r2=1659759&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Feb 14 07:49:54 2015
@@ -1,5 +1,8 @@
Release 1.8 - Current Development
+ * Increased the speed of language identification by
+ a factor of two. (TIKA-1549)
+
* Added parser for Sqlite3 db files. Users need to add
org.xerial's sqlite-jdbc to their classpath. Tika is not
currently bundling that dependency. (TIKA-1511)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java?rev=1659759&r1=1659758&r2=1659759&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
Sat Feb 14 07:49:54 2015
@@ -16,10 +16,15 @@
*/
package org.apache.tika.language;
+
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
/**
* Language profile based on ngram counts.
@@ -39,6 +44,12 @@ public class LanguageProfile {
new HashMap<String, Counter>();
/**
+ * Sorted ngram cache for faster distance calculation.
+ */
+ private Interleaved interleaved = new Interleaved();
+ public static boolean useInterleaved = true; // For testing purposes
+
+ /**
* The sum of all ngram counts in this profile.
* Used to calculate relative ngram frequency.
*/
@@ -123,6 +134,10 @@ public class LanguageProfile {
* @return distance between the profiles
*/
public double distance(LanguageProfile that) {
+ return useInterleaved ? distanceInterleaved(that) :
distanceStandard(that);
+ }
+
+ private double distanceStandard(LanguageProfile that) {
if (length != that.length) {
throw new IllegalArgumentException(
"Unable to calculage distance of language profiles"
@@ -152,4 +167,148 @@ public class LanguageProfile {
return ngrams.toString();
}
+ /* Code for interleaved distance calculation below */
+
+ private double distanceInterleaved(LanguageProfile that) {
+ if (length != that.length) {
+ throw new IllegalArgumentException(
+ "Unable to calculage distance of language profiles"
+ + " with different ngram lengths: "
+ + that.length + " != " + length);
+ }
+
+ double sumOfSquares = 0.0;
+ double thisCount = Math.max(this.count, 1.0);
+ double thatCount = Math.max(that.count, 1.0);
+
+ Interleaved.Entry thisEntry = updateInterleaved().firstEntry();
+ Interleaved.Entry thatEntry = that.updateInterleaved().firstEntry();
+
+ // Iterate the lists in parallel, until both lists has been depleted
+ while (thisEntry.hasNgram() || thatEntry.hasNgram()) {
+ if (!thisEntry.hasNgram()) { // Depleted this
+ sumOfSquares += square(thatEntry.count / thatCount);
+ thatEntry.next();
+ continue;
+ }
+
+ if (!thatEntry.hasNgram()) { // Depleted that
+ sumOfSquares += square(thisEntry.count / thisCount);
+ thisEntry.next();
+ continue;
+ }
+
+ final int compare = thisEntry.compareTo(thatEntry);
+
+ if (compare == 0) { // Term exists both in this and that
+ double difference = thisEntry.count/thisCount -
thatEntry.count/thatCount;
+ sumOfSquares += square(difference);
+ thisEntry.next();
+ thatEntry.next();
+ } else if (compare < 0) { // Term exists only in this
+ sumOfSquares += square(thisEntry.count/thisCount);
+ thisEntry.next();
+ } else { // Term exists only in that
+ sumOfSquares += square(thatEntry.count/thatCount);
+ thatEntry.next();
+ }
+ }
+ return Math.sqrt(sumOfSquares);
+ }
+ private double square(double count) {
+ return count * count;
+ }
+
+ private class Interleaved {
+
+ private char[] entries = null; // <ngram(length chars)><count(2
chars)>*
+ private int size = 0; // Number of entries (one entry = length+2 chars)
+ private long entriesGeneratedAtCount = -1; // Keeps track of when the
sequential structure was current
+
+ /**
+ * Ensure that the entries array is in sync with the ngrams.
+ */
+ public void update() {
+ if (count == entriesGeneratedAtCount) { // Already up to date
+ return;
+ }
+ size = ngrams.size();
+ final int numChars = (length+2)*size;
+ if (entries == null || entries.length < numChars) {
+ entries = new char[numChars];
+ }
+ int pos = 0;
+ for (Map.Entry<String, Counter> entry: getSortedNgrams()) {
+ for (int l = 0 ; l < length ; l++) {
+ entries[pos + l] = entry.getKey().charAt(l);
+ }
+ entries[pos + length] = (char)(entry.getValue().count /
65536); // Upper 16 bit
+ entries[pos + length + 1] = (char)(entry.getValue().count %
65536); // lower 16 bit
+ pos += length + 2;
+ }
+ entriesGeneratedAtCount = count;
+ }
+
+ public Entry firstEntry() {
+ Entry entry = new Entry();
+ if (size > 0) {
+ entry.update(0);
+ }
+ return entry;
+ }
+
+ private List<Map.Entry<String, Counter>> getSortedNgrams() {
+ List<Map.Entry<String, Counter>> entries = new
ArrayList<Map.Entry<String, Counter>>(ngrams.size());
+ entries.addAll(ngrams.entrySet());
+ Collections.sort(entries, new Comparator<Map.Entry<String,
Counter>>() {
+ @Override
+ public int compare(Map.Entry<String, Counter> o1,
Map.Entry<String, Counter> o2) {
+ return o1.getKey().compareTo(o2.getKey());
+ }
+ });
+ return entries;
+ }
+
+ private class Entry implements Comparable<Entry> {
+ char[] ngram = new char[length];
+ int count = 0;
+ int pos = 0;
+
+ private void update(int pos) {
+ this.pos = pos;
+ if (pos >= size) { // Reached the end
+ return;
+ }
+ final int origo = pos*(length+2);
+ System.arraycopy(entries, origo, ngram, 0, length);
+ count = entries[origo+length] * 65536 +
entries[origo+length+1];
+ }
+
+ @Override
+ public int compareTo(Entry other) {
+ for (int i = 0 ; i < ngram.length ; i++) {
+ if (ngram[i] != other.ngram[i]) {
+ return ngram[i] - other.ngram[i];
+ }
+ }
+ return 0;
+ }
+ public boolean hasNext() {
+ return pos < size-1;
+ }
+ public boolean hasNgram() {
+ return pos < size;
+ }
+ public void next() {
+ update(pos+1);
+ }
+ public String toString() {
+ return new String(ngram) + "(" + count + ")";
+ }
+ }
+ }
+ private Interleaved updateInterleaved() {
+ interleaved.update();
+ return interleaved;
+ }
}
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java?rev=1659759&r1=1659758&r2=1659759&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
Sat Feb 14 07:49:54 2015
@@ -25,6 +25,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.util.HashMap;
+import java.util.Locale;
import org.apache.tika.io.IOUtils;
import org.junit.Before;
@@ -49,7 +50,7 @@ public class LanguageIdentifierTest {
public void setUp() {
LanguageIdentifier.initProfiles();
}
-
+
@Test
public void testLanguageDetection() throws IOException {
for (String language : languages) {
@@ -105,8 +106,43 @@ public class LanguageIdentifierTest {
assertTrue(identifier.isReasonablyCertain());
}
- @Test
- public void testMixedLanguages() throws IOException {
+ // Enable this to compare performance
+ public void testPerformance() throws IOException {
+ final int MRUNS = 8;
+ final int IRUNS = 10;
+ int detected = 0; // To avoid code removal by JVM or compiler
+ String lastResult = null;
+ for (int m = 0 ; m < MRUNS ; m++) {
+ LanguageProfile.useInterleaved = (m & 1) == 1; // Alternate
between standard and interleaved
+ String currentResult = "";
+ final long start = System.nanoTime();
+ for (int i = 0 ; i < IRUNS ; i++) {
+ for (String language : languages) {
+ ProfilingWriter writer = new ProfilingWriter();
+ writeTo(language, writer);
+ LanguageIdentifier identifier = new
LanguageIdentifier(writer.getProfile());
+ if (identifier.isReasonablyCertain()) {
+ currentResult += identifier.getLanguage();
+ detected++;
+ }
+ }
+ }
+ System.out.println(String.format(Locale.ROOT,
+ "Performed %d detections at %2d ms/test with
interleaved=%b",
+ languages.length*IRUNS,
(System.nanoTime()-start)/1000000/(languages.length*IRUNS),
+ LanguageProfile.useInterleaved));
+ if (lastResult != null) { // Might as well test that they behave
the same while we're at it
+ assertEquals("This result should be equal to the last",
lastResult, currentResult);
+ }
+ lastResult = currentResult;
+ }
+ if (detected == -1) {
+ System.out.println("Never encountered but keep it to guard against
over-eager optimization");
+ }
+ }
+
+ @Test
+ public void testMixedLanguages() throws IOException {
for (String language : languages) {
for (String other : languages) {
if (!language.equals(other)) {