Author: jukka
Date: Tue Mar 24 12:07:19 2009
New Revision: 757751
URL: http://svn.apache.org/viewvc?rev=757751&view=rev
Log:
TIKA-208: Special characters in HTML file are not parsed correctly
Use the HTML-specific encoding detection in NekoHTML instead of the more
generic ICU4J one.
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=757751&r1=757750&r2=757751&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Mar 24 12:07:19 2009
@@ -34,7 +34,6 @@
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.utils.Utils;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -105,7 +104,7 @@
// Parse the HTML document
SAXParser parser = new SAXParser();
parser.setContentHandler(new XHTMLDowngradeHandler(handler));
- parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
+ parser.parse(new InputSource(stream));
}
private ContentHandler getTitleHandler(final Metadata metadata) {