Author: jukka
Date: Tue Aug 23 09:43:27 2011
New Revision: 1160599
URL: http://svn.apache.org/viewvc?rev=1160599&view=rev
Log:
TIKA-434: Bug in TagSoup causes IOException
Upgrade to TagSoup 1.2.1. No more need for the BufferedReader workaround.
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1160599&r1=1160598&r2=1160599&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Aug 23 09:43:27 2011
@@ -115,7 +115,7 @@
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
- <version>1.2</version>
+ <version>1.2.1</version>
</dependency>
<dependency>
<groupId>asm</groupId>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1160599&r1=1160598&r2=1160599&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Aug 23 09:43:27 2011
@@ -31,13 +31,10 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.utils.CharsetUtils;
-import org.ccil.cowan.tagsoup.HTMLScanner;
import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.ScanHandler;
import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
@@ -202,14 +199,6 @@ public class HtmlParser extends Abstract
parser.setContentHandler(new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata)));
- parser.setProperty(org.ccil.cowan.tagsoup.Parser.scannerProperty,
- new HTMLScanner() {
- @Override
- public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
- super.scan(new PushbackReader(new BufferedReader(r0), 2), h);
- }
- });
-
parser.parse(source);
}