Author: rfrovarp
Date: Wed Nov 23 22:10:37 2011
New Revision: 1205631
URL: http://svn.apache.org/viewvc?rev=1205631&view=rev
Log:
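Give TikaDocumentParser the same capabilities as TikaHtmlParser: XML
serialization of the document, body text, Boilerpipe main-content extraction,
link extraction, and metadata. Since the two parsers now perform the same
operations, mark TikaHtmlParser as deprecated in favor of TikaDocumentParser,
which has the more generic name.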
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1205631&r1=1205630&r2=1205631&view=diff
==============================================================================
---
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
(original)
+++
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
Wed Nov 23 22:10:37 2011
@@ -20,7 +20,20 @@ package org.apache.droids.tika;
import java.io.IOException;
import java.io.InputStream;
-
+import java.io.StringWriter;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.LinkTask;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.exception.DroidsException;
@@ -31,25 +44,65 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.SAXException;
public class TikaDocumentParser implements TikaParser {
+ protected static final Log log = LogFactory.getLog(TikaDocumentParser.class);
+
@Override
public TikaParse parse(ContentEntity entity, Link link) throws
DroidsException,
IOException {
+ // Init Tika objects
org.apache.tika.parser.Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
- BodyContentHandler handler = new BodyContentHandler(-1);
+ String charset = entity.getCharset();
+ if (charset == null) {
+ charset = "UTF-8";
+ }
+
+ StringWriter dataBuffer = new StringWriter();
+ StringWriter bodyBuffer = new StringWriter();
+ StringWriter mainContentBuffer = new StringWriter();
+
+ SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ TransformerHandler xmlHandler;
+ try {
+ xmlHandler = factory.newTransformerHandler();
+ } catch (TransformerConfigurationException e) {
+ throw new DroidsException(e);
+ }
+ xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ xmlHandler.setResult(new StreamResult(dataBuffer));
+
+ BoilerpipeContentHandler mainContentHandler = new
BoilerpipeContentHandler(mainContentBuffer);
+ BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+ LinkContentHandler linkHandler = new LinkContentHandler();
+
+ TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler,
mainContentHandler, bodyHandler, linkHandler );
+
InputStream instream = entity.obtainContent();
try {
- parser.parse(instream, handler, metadata, new ParseContext());
- TikaParseImpl parse = new TikaParseImpl(handler.toString(),null);
+ parser.parse(instream, parallelHandler, metadata, new ParseContext());
- return parse;
-
+ ArrayList<Link> extractedTasks = new ArrayList<Link>();
+ int depth = link.getDepth() + 1;
+ for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ try {
+ extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()),
depth, tikaLink.getText()));
+ } catch (URISyntaxException e) {
+ if(log.isWarnEnabled()) {
+ log.warn("URI not valid: "+ tikaLink.getUri());
+ }
+ }
+ }
+
+ return new TikaParseImpl(dataBuffer.toString(), extractedTasks,
bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
} catch (SAXException ex) {
throw new DroidsException("Failure parsing document " + link.getId(),
ex);
} catch (TikaException ex) {
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1205631&r1=1205630&r2=1205631&view=diff
==============================================================================
---
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
(original)
+++
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Wed Nov 23 22:10:37 2011
@@ -48,6 +48,11 @@ import org.apache.tika.sax.LinkContentHa
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.SAXException;
+/**
+ *
+ * @deprecated Use {@link TikaDocumentParser} instead; it handles HTML and performs the same operations.
+ *
+ */
public class TikaHtmlParser implements TikaParser {
protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);