Author: jukka
Date: Wed Oct 14 19:45:02 2009
New Revision: 825239
URL: http://svn.apache.org/viewvc?rev=825239&view=rev
Log:
TIKA-310: Use TagSoup to parse HTML
Modified:
lucene/tika/trunk/tika-parsers/pom.xml
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=825239&r1=825238&r2=825239&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Wed Oct 14 19:45:02 2009
@@ -92,9 +92,9 @@
<version>1.1.1</version>
</dependency>
<dependency>
- <groupId>net.sourceforge.nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- <version>1.9.9</version>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2</version>
</dependency>
<dependency>
<groupId>asm</groupId>
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825239&r1=825238&r2=825239&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Oct 14 19:45:02 2009
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -35,7 +35,6 @@
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
-import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
@@ -117,7 +116,8 @@
new MatchingContentHandler(getMetaHandler(metadata), meta));
// Parse the HTML document
- SAXParser parser = new SAXParser();
+ org.ccil.cowan.tagsoup.Parser parser =
+ new org.ccil.cowan.tagsoup.Parser();
parser.setContentHandler(new XHTMLDowngradeHandler(handler));
parser.parse(source);
}