Author: jukka
Date: Tue Mar 24 12:07:19 2009
New Revision: 757751

URL: http://svn.apache.org/viewvc?rev=757751&view=rev
Log:
TIKA-208: Special characters in HTML file are not parsed correctly

Use the HTML-specific encoding detection built into NekoHTML instead of the
more generic ICU4J-based detection.

Modified:
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=757751&r1=757750&r2=757751&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java 
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java 
Tue Mar 24 12:07:19 2009
@@ -34,7 +34,6 @@
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.utils.Utils;
 import org.cyberneko.html.parsers.SAXParser;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -105,7 +104,7 @@
         // Parse the HTML document
         SAXParser parser = new SAXParser();
         parser.setContentHandler(new XHTMLDowngradeHandler(handler));
-        parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
+        parser.parse(new InputSource(stream));
     }
 
     private ContentHandler getTitleHandler(final Metadata metadata) {


Reply via email to