Author: jukka
Date: Thu Aug 13 22:26:36 2009
New Revision: 804046

URL: http://svn.apache.org/viewvc?rev=804046&view=rev
Log:
TIKA-209: Language detection is weak

Add test case for ProfilingWriter and adapt LanguageIdentifier to better 
support using the ProfilingWriter.

Added:
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=804046&r1=804045&r2=804046&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java Thu Aug 13 22:26:36 2009
@@ -66,20 +66,8 @@
     /**
      * Constructs a new Language Identifier.
      */
-    public LanguageIdentifier() {
-
-        // Gets ngram sizes to take into account from the Nutch Config
-        minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
-        maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
-        // Ensure the min and max values are in an acceptale range
-        // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
-        maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
-        maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
-        minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
-        minLength = Math.min(minLength, maxLength);
-
-        // Gets the value of the maximum size of data to analyze
-        analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+    public LanguageIdentifier(NGramProfile suspect) {
+        this.suspect = suspect;
 
         Properties p = new Properties();
         try {
@@ -128,14 +116,19 @@
                     ngramsIdx.put(entry.getSeq(), array);
                 }
             }
-            // Create the suspect profile
-            suspect = new NGramProfile("suspect", minLength, maxLength);
         } catch (Exception e) {
             e.printStackTrace();
             // if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
         }
     }
 
+    public LanguageIdentifier() {
+        this(new NGramProfile(
+                "suspect",
+                NGramProfile.DEFAULT_MIN_NGRAM_LENGTH,
+                NGramProfile.DEFAULT_MAX_NGRAM_LENGTH));
+    }
+
     /**
      * Main method used for command line process.
      * <br/>Usage is:
@@ -298,8 +291,12 @@
             text = new StringBuilder().append(content);
             text.setLength(analyzeLength);
         }
-
         suspect.analyze(text);
+
+        return identify();
+    }
+
+    public String identify() {
         Iterator<NGramEntry> iter = suspect.getSorted().iterator();
         float topscore = Float.MIN_VALUE;
         String lang = "";

Added: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java?rev=804046&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java (added)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/language/ProfilingWriterTest.java Thu Aug 13 22:26:36 2009
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.apache.tika.io.IOUtils;
+
+import junit.framework.TestCase;
+
+public class ProfilingWriterTest extends TestCase {
+
+    public void testProfilingWriter() throws IOException {
+        assertProfile("da");
+        assertProfile("de");
+        assertProfile("el");
+        assertProfile("en");
+        assertProfile("es");
+        assertProfile("fi");
+        assertProfile("fr");
+        assertProfile("it");
+        assertProfile("nl");
+        assertProfile("pt");
+        assertProfile("sv");
+    }
+
+    private void assertProfile(String lang) throws IOException {
+        InputStream stream =
+            ProfilingWriterTest.class.getResourceAsStream(lang + ".test");
+        try {
+            ProfilingWriter writer = new ProfilingWriter();
+            IOUtils.copy(new InputStreamReader(stream, "UTF-8"), writer);
+            NGramProfile profile = writer.getProfile();
+            assertEquals(lang, new LanguageIdentifier(profile).identify());
+        } finally {
+            stream.close();
+        }
+    }
+
+}


Reply via email to