Author: jukka
Date: Thu Sep 10 22:45:10 2009
New Revision: 813626

URL: http://svn.apache.org/viewvc?rev=813626&view=rev
Log:
TIKA-273: Content encoding in HtmlParser

Based on a suggestion by Piotr B.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=813626&r1=813625&r2=813626&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Sep 10 22:45:10 2009
@@ -94,6 +94,13 @@
         // Protect the stream from being closed by CyberNeko
         stream = new CloseShieldInputStream(stream);
 
+        // Prepare the input source using the encoding hint if available
+        InputSource source = new InputSource(stream); 
+        String encoding = metadata.get(Metadata.CONTENT_ENCODING); 
+        if (encoding != null) { 
+            source.setEncoding(encoding);
+        }
+
         // Prepare the HTML content handler that generates proper
         // XHTML events to records relevant document metadata
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
@@ -109,7 +116,7 @@
         // Parse the HTML document
         SAXParser parser = new SAXParser();
         parser.setContentHandler(new XHTMLDowngradeHandler(handler));
-        parser.parse(new InputSource(stream));
+        parser.parse(source);
     }
 
     private ContentHandler getTitleHandler(final Metadata metadata) {


Reply via email to