Author: jukka
Date: Sun Dec 13 00:23:29 2009
New Revision: 890010

URL: http://svn.apache.org/viewvc?rev=890010&view=rev
Log:
TIKA-334: HtmlParser should use CharsetDetector whenever no charset is 
specified via meta http-equiv tag

Note that the BufferedInputStream wrapper needs to be applied *outside* the 
getEncoding() method so that the mark()/reset() operations affect the stream 
instance that's later passed to tagsoup for parsing.

Also rearranged some comments.

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890010&r1=890009&r2=890010&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 00:23:29 2009
@@ -49,13 +49,13 @@
             "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
             + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\"");
 
-    // TODO: Move this into core, along with CharsetDetector
+    /**
+     * TIKA-332: Check for meta http-equiv tag with charset info in
+     * HTML content.
+     * <p>
+     * TODO: Move this into core, along with CharsetDetector
+     */ 
     private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
-        // TIKA-332: Check for meta http-equiv tag with charset info in HTML content
-        if (!stream.markSupported()) {
-            stream = new BufferedInputStream(stream);
-        }
-
         stream.mark(META_TAG_BUFFER_SIZE);
         char[] buffer = new char[META_TAG_BUFFER_SIZE];
         InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
@@ -118,7 +118,12 @@
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
-    throws IOException, SAXException, TikaException {
+            throws IOException, SAXException, TikaException {
+        // The getEncoding() method depends on the mark feature
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+
         // Protect the stream from being closed by CyberNeko
         // TODO: Is this still needed, given our use of TagSoup?
         stream = new CloseShieldInputStream(stream);


Reply via email to