Author: jnioche
Date: Tue Feb 16 10:20:22 2010
New Revision: 910454

URL: http://svn.apache.org/viewvc?rev=910454&view=rev
Log:
NUTCH-794 : Language Identification must check the parse metadata for
language values

Modified:
    
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
    
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 Tue Feb 16 10:20:22 2010
@@ -91,15 +91,33 @@
     
     Parse parse = parseResult.get(content.getUrl());
 
+    String lang = getLanguageFromMetadata(parse.getData().getParseMeta());
+    if (lang != null) {
+      parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
+      return parseResult;
+    }
+    
     // Trying to find the document's language
     LanguageParser parser = new LanguageParser(doc);
-    String lang = parser.getLanguage();
+    lang = parser.getLanguage();
 
     if (lang != null) {
       parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
     }
     return parseResult;
   }
+  
+  // Check in the metadata whether the language has already been stored there by Tika
+  private static String getLanguageFromMetadata(Metadata parseMD){
+    // dublin core 
+    String lang = parseMD.get("dc.language");
+    if (lang!=null) return lang;
+    // meta content-language
+    lang = parseMD.get("content-language");
+    if (lang!=null) return lang;
+    // lang attribute
+    return parseMD.get("lang");
+  }
 
   static class LanguageParser {
     

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 Tue Feb 16 10:20:22 2010
@@ -40,7 +40,8 @@
      "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
      "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" };
 
-  String metalanguages[] = { "fi", "en", "en" };
+  // NUTCH-794 : temporarily replaced "fi" and "en" with null
+  String metalanguages[] = { null, "en", "en" };
 
   /**
    * Test parsing of language identifiers from html 


Reply via email to