Author: jukka
Date: Thu Sep 10 22:45:10 2009
New Revision: 813626
URL: http://svn.apache.org/viewvc?rev=813626&view=rev
Log:
TIKA-273: Content encoding in HtmlParser
Based on suggestion by Piotr B.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=813626&r1=813625&r2=813626&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Thu Sep 10 22:45:10 2009
@@ -94,6 +94,13 @@
// Protect the stream from being closed by CyberNeko
stream = new CloseShieldInputStream(stream);
+ // Prepare the input source using the encoding hint if available
+ InputSource source = new InputSource(stream);
+ String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+ if (encoding != null) {
+ source.setEncoding(encoding);
+ }
+
// Prepare the HTML content handler that generates proper
// XHTML events and records relevant document metadata
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
@@ -109,7 +116,7 @@
// Parse the HTML document
SAXParser parser = new SAXParser();
parser.setContentHandler(new XHTMLDowngradeHandler(handler));
- parser.parse(new InputSource(stream));
+ parser.parse(source);
}
private ContentHandler getTitleHandler(final Metadata metadata) {