Author: jukka
Date: Thu Aug 13 22:26:36 2009
New Revision: 804046
URL: http://svn.apache.org/viewvc?rev=804046&view=rev
Log:
TIKA-209: Language detection is weak
Add test case for ProfilingWriter and adapt LanguageIdentifier to better
support using the ProfilingWriter.
Added:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=804046&r1=804045&r2=804046&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Thu Aug 13 22:26:36 2009
@@ -66,20 +66,8 @@
/**
* Constructs a new Language Identifier.
*/
- public LanguageIdentifier() {
-
- // Gets ngram sizes to take into account from the Nutch Config
- minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
- maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
- // Ensure the min and max values are in an acceptale range
- // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
- maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
- maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
- minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
- minLength = Math.min(minLength, maxLength);
-
- // Gets the value of the maximum size of data to analyze
- analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+ public LanguageIdentifier(NGramProfile suspect) {
+ this.suspect = suspect;
Properties p = new Properties();
try {
@@ -128,14 +116,19 @@
ngramsIdx.put(entry.getSeq(), array);
}
}
- // Create the suspect profile
- suspect = new NGramProfile("suspect", minLength, maxLength);
} catch (Exception e) {
e.printStackTrace();
// if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
}
+ public LanguageIdentifier() { // Default constructor: delegates to LanguageIdentifier(NGramProfile) with a freshly created profile.
+ this(new NGramProfile(
+ "suspect", // same profile name the removed in-constructor creation used
+ NGramProfile.DEFAULT_MIN_NGRAM_LENGTH, // takes the place of the former minLength argument
+ NGramProfile.DEFAULT_MAX_NGRAM_LENGTH)); // takes the place of the former maxLength argument
+ }
+
/**
* Main method used for command line process.
* <br/>Usage is:
@@ -298,8 +291,12 @@
text = new StringBuilder().append(content);
text.setLength(analyzeLength);
}
-
suspect.analyze(text);
+
+ return identify();
+ }
+
+ public String identify() {
Iterator<NGramEntry> iter = suspect.getSorted().iterator();
float topscore = Float.MIN_VALUE;
String lang = "";
Added:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java?rev=804046&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java (added)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java Thu Aug 13 22:26:36 2009
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.apache.tika.io.IOUtils;
+
+import junit.framework.TestCase;
+
+public class ProfilingWriterTest extends TestCase { // Checks that profiles built by ProfilingWriter are identified as the expected language.
+
+ public void testProfilingWriter() throws IOException { // Runs the same check once for each language code below.
+ assertProfile("da");
+ assertProfile("de");
+ assertProfile("el");
+ assertProfile("en");
+ assertProfile("es");
+ assertProfile("fi");
+ assertProfile("fr");
+ assertProfile("it");
+ assertProfile("nl");
+ assertProfile("pt");
+ assertProfile("sv");
+ }
+
+ private void assertProfile(String lang) throws IOException { // Profiles the "<lang>.test" resource and asserts identify() returns lang.
+ InputStream stream =
+ ProfilingWriterTest.class.getResourceAsStream(lang + ".test"); // NOTE(review): returns null if the resource is missing, which would surface as a NullPointerException below
+ try {
+ ProfilingWriter writer = new ProfilingWriter();
+ IOUtils.copy(new InputStreamReader(stream, "UTF-8"), writer); // decode the sample as UTF-8 and copy it into the writer
+ NGramProfile profile = writer.getProfile(); // profile obtained from the writer after the copy
+ assertEquals(lang, new LanguageIdentifier(profile).identify()); // the identified language must equal the expected code
+ } finally {
+ stream.close(); // close the resource stream whether or not the assertion passed
+ }
+ }
+
+}