Author: jukka
Date: Wed Oct 14 19:45:02 2009
New Revision: 825239

URL: http://svn.apache.org/viewvc?rev=825239&view=rev
Log:
TIKA-310: Use TagSoup to parse HTML

Modified:
    lucene/tika/trunk/tika-parsers/pom.xml
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=825239&r1=825238&r2=825239&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Wed Oct 14 19:45:02 2009
@@ -92,9 +92,9 @@
       <version>1.1.1</version>
     </dependency>
     <dependency>
-      <groupId>net.sourceforge.nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-      <version>1.9.9</version>
+      <groupId>org.ccil.cowan.tagsoup</groupId>
+      <artifactId>tagsoup</artifactId>
+      <version>1.2</version>
     </dependency>
     <dependency>
       <groupId>asm</groupId>

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825239&r1=825238&r2=825239&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Oct 14 19:45:02 2009
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -35,7 +35,6 @@
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
-import org.cyberneko.html.parsers.SAXParser;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
@@ -117,7 +116,8 @@
                 new MatchingContentHandler(getMetaHandler(metadata), meta));
 
         // Parse the HTML document
-        SAXParser parser = new SAXParser();
+        org.ccil.cowan.tagsoup.Parser parser =
+            new org.ccil.cowan.tagsoup.Parser();
         parser.setContentHandler(new XHTMLDowngradeHandler(handler));
         parser.parse(source);
     }


Reply via email to