Author: jukka
Date: Sun Dec 13 22:10:33 2009
New Revision: 890130

URL: http://svn.apache.org/viewvc?rev=890130&view=rev
Log:
TIKA-339: HtmlParser & TXTParser should not use language returned by 
CharsetDetector if language hint has been provided

Patch by Ken Krugler

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
 Sun Dec 13 22:10:33 2009
@@ -106,13 +106,18 @@
             if (Charset.isSupported(match.getName())) {
                 metadata.set(Metadata.CONTENT_ENCODING, match.getName());
 
-                // Is the encoding language-specific (KOI8-R, SJIS, etc.)?
+                // TIKA-339: Don't set language, as it's typically not a very good
+                // guess, and it can create ambiguity if another (better) language
+                // value is specified by a meta tag in the HTML (or via HTTP response
+                // header).
+                /*
                 String language = match.getLanguage();
                 if (language != null) {
                     metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
                     metadata.set(Metadata.LANGUAGE, match.getLanguage());
                 }
-
+                */
+                
                 break;
             }
         }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
 Sun Dec 13 22:10:33 2009
@@ -100,8 +100,8 @@
                 // Is the encoding language-specific (KOI8-R, SJIS, etc.)?
                 String language = match.getLanguage();
                 if (language != null) {
-                    metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
-                    metadata.set(Metadata.LANGUAGE, match.getLanguage());
+                    metadata.add(Metadata.CONTENT_LANGUAGE, language);
+                    metadata.add(Metadata.LANGUAGE, language);
                 }
 
                 break;

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Sun Dec 13 22:10:33 2009
@@ -292,4 +292,19 @@
         assertEquals("baz", parts[2]);
     }
 
+    /**
+     * Test case for TIKA-339: Don't use language returned by CharsetDetector
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+     */
+    public void testIgnoreCharsetDetectorLanguage() throws Exception {
+        String test = "<html><title>Simple Content</title><body></body></html>";
+        Metadata metadata = new Metadata();
+        metadata.add(Metadata.CONTENT_LANGUAGE, "en");
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new BodyContentHandler(),  metadata, new ParseContext());
+
+        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
+    }
+
 }

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=890130&r1=890129&r2=890130&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
 Sun Dec 13 22:10:33 2009
@@ -161,4 +161,22 @@
         assertEquals(msg, expected, handler.toString());
     }
 
+    /**
+     * Test case for TIKA-339: don't override incoming language
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
+     */
+    public void testRetainIncomingLanguage() throws Exception {
+        final String test = "Simple Content";
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.LANGUAGE, "en");
+
+        parser.parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new BodyContentHandler(),  metadata, new ParseContext());
+
+        assertEquals("en", metadata.get(Metadata.LANGUAGE));
+    }
+
 }


Reply via email to