Author: jukka
Date: Tue Aug 23 09:43:27 2011
New Revision: 1160599

URL: http://svn.apache.org/viewvc?rev=1160599&view=rev
Log:
TIKA-434: Bug in TagSoup causes IOException

Upgrade to TagSoup 1.2.1. No more need for the BufferedReader workaround.

Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1160599&r1=1160598&r2=1160599&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Aug 23 09:43:27 2011
@@ -115,7 +115,7 @@
     <dependency>
       <groupId>org.ccil.cowan.tagsoup</groupId>
       <artifactId>tagsoup</artifactId>
-      <version>1.2</version>
+      <version>1.2.1</version>
     </dependency>
     <dependency>
       <groupId>asm</groupId>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1160599&r1=1160598&r2=1160599&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Aug 23 09:43:27 2011
@@ -31,13 +31,10 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.utils.CharsetUtils;
-import org.ccil.cowan.tagsoup.HTMLScanner;
 import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.ScanHandler;
 import org.ccil.cowan.tagsoup.Schema;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
@@ -202,14 +199,6 @@ public class HtmlParser extends Abstract
         parser.setContentHandler(new XHTMLDowngradeHandler(
                 new HtmlHandler(mapper, handler, metadata)));
 
-        parser.setProperty(org.ccil.cowan.tagsoup.Parser.scannerProperty,
-                new HTMLScanner() {
-                    @Override
-                    public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
-                        super.scan(new PushbackReader(new BufferedReader(r0), 2), h);
-                    }
-                });
-
         parser.parse(source);
     }
 


Reply via email to