Author: olegk
Date: Fri Nov 14 11:53:13 2008
New Revision: 714116

URL: http://svn.apache.org/viewvc?rev=714116&view=rev
Log:
Use SAX parser / LinkExtractor instead of DOM fragment parser for HTML parsing 
in Droids Core

Modified:
    
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java
    
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
    
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
    
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
    
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java

Modified: 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java
 (original)
+++ 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/impl/SequentialTaskMaster.java
 Fri Nov 14 11:53:13 2008
@@ -86,10 +86,10 @@
         }
         switch (result) {
         case WARN:
-          log.warn(ex.getMessage());
+          log.warn(ex.toString());
           break;
         case FATAL:
-          log.warn(ex.getMessage());
+          log.warn(ex.getMessage(), ex);
           terminated = true;
           break;
         }

Modified: 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
 (original)
+++ 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
 Fri Nov 14 11:53:13 2008
@@ -18,31 +18,21 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Map;
 
+import org.apache.droids.ParseData;
 import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
 import org.apache.droids.api.Parser;
 import org.apache.droids.exception.ContentFormatViolationException;
 import org.apache.droids.exception.DroidsException;
-import org.apache.droids.exception.InvalidLinkException;
 import org.apache.droids.helper.Loggable;
-import org.apache.droids.LinkTask;
-import org.apache.droids.ParseData;
 import org.apache.droids.parse.ParseImpl;
-import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.xerces.xni.parser.XMLDocumentFilter;
 import org.cyberneko.html.filters.ElementRemover;
-import org.cyberneko.html.parsers.DOMFragmentParser;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.cyberneko.html.parsers.SAXParser;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXNotRecognizedException;
@@ -76,36 +66,24 @@
     this.base = newLink.getURI();
     ParseData parseData = null;
     // setup filter chain
-    final XMLDocumentFilter[] filters = { getRemover() };
+    XMLDocumentFilter[] filters = { getRemover() };
     // create HTML parser
-    final DOMFragmentParser parser = getParser(filters);
-    final DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-    // parse document
-    // XMLInputSource source = new XMLInputSource(null, uri, uri);
+    SAXParser parser = getParser(filters);
+    LinkExtractor linkExtractor = new LinkExtractor(newLink, elements);
+    parser.setContentHandler(linkExtractor);
     InputStream instream = entity.obtainContent();
     try {
-      parser.parse(new InputSource(instream), node);
+      parser.parse(new InputSource(instream));
     } catch (SAXException ex) {
       throw new ContentFormatViolationException("Failure parsing HTML content", ex);
     } finally {
       instream.close();
     }
-    parseData = extract(node);
-    return new ParseImpl(newLink.getId(), parseData);
+    return new ParseImpl(newLink.getId(), new ParseData(linkExtractor.getLinks()));
   }
 
-  private ParseData extract(DocumentFragment node) throws InvalidLinkException {
-    final ArrayList<Link> links = new ArrayList<Link>();
-    try {
-      extractLinks(node, links, new HashSet<URI>());
-    } catch (URISyntaxException ex) {
-      throw new InvalidLinkException("Invalid URI: " + ex.getInput(), ex);
-    }
-    return new ParseData(links);
-  }
-
-  private DOMFragmentParser getParser(XMLDocumentFilter[] filters) {
-    final DOMFragmentParser parser = new DOMFragmentParser();
+  private SAXParser getParser(XMLDocumentFilter[] filters) {
+    SAXParser parser = new SAXParser();
     try {
       parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
       parser.setFeature(
@@ -138,49 +116,4 @@
     return remover;
   }
 
-  private void extractLinks(Node node, ArrayList<Link> links,
-      HashSet<URI> set) throws URISyntaxException {
-    if (node.getNodeType() == Node.ELEMENT_NODE) {
-      String nodeName = node.getNodeName().toLowerCase();
-      if (elements.containsKey(nodeName)) {
-        String value = elements.get(nodeName);
-        NamedNodeMap attrs = node.getAttributes();
-        for (int i = 0; i < attrs.getLength(); i++) {
-          Node attr = attrs.item(i);
-          String attrName = attr.getNodeName();
-          if (attrName.equalsIgnoreCase(value)) {
-            String ref = attr.getNodeValue();
-            URI newUri = null;
-            if(ref.startsWith("/")){
-              newUri = new URI(
-                  base.getScheme(), base.getUserInfo(), base.getHost(), base.getPort(), 
-                  ref, null, null);
-            }else if(!ref.toLowerCase().startsWith("javascript")){
-              newUri = base.resolve(new URI(ref));
-            }
-            if (newUri != null) {
-              // Link from, URI uri, int depth, String text
-              final LinkTask outlink = new LinkTask( link, newUri, link.getDepth()+1 );
-              if (log.isDebugEnabled()) {
-                log.debug("set size: "+set.size());
-                log.debug("outlink.getToUrl(): "+outlink.getURI());
-                log.debug("set.contains(outlink.getToUrl(): " + set.contains(newUri));
-              }
-              if (!set.contains(newUri)) {
-                set.add(newUri);
-                links.add(outlink);
-              }
-            }
-          }
-        }
-      }
-    }
-    final NodeList children = node.getChildNodes();
-    if (children != null) {
-      int len = children.getLength();
-      for (int i = 0; i < len; i++) {
-        extractLinks(children.item(i), links, set);
-      }
-    }
-  }
 }

Modified: 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
 (original)
+++ 
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
 Fri Nov 14 11:53:13 2008
@@ -37,19 +37,19 @@
   protected final Log log = LogFactory.getLog(this.getClass());
 
   /**
-   * List of links
+   * Base url for host reference
    */
-  private Collection<Link> links = new ArrayList<Link>();
+  private final Link base;
 
   /**
    * Map with the pair label-attribute for the accepted items
    */
-  private Map<String, String> elements;
+  private final Map<String, String> elements;
 
   /**
-   * Base url for host reference
+   * List of links
    */
-  private Link base = null;
+  private Collection<Link> links = new ArrayList<Link>();
 
   /**
    * Set of URIs visited yet
@@ -61,6 +61,12 @@
    */
   private URI link = null;
 
+  public LinkExtractor(Link base, Map<String, String> elements) {
+    super();
+    this.base = base;
+    this.elements = elements;
+  }
+  
   @Override
   public void startDocument() throws SAXException {
     history = new HashSet<String>();
@@ -113,10 +119,6 @@
     }
   }
 
-  public void setBase(Link base) {
-    this.base = base;
-  }
-
   public Collection<Link> getLinks() {
     return links;
   }
@@ -125,10 +127,6 @@
     return elements;
   }
 
-  public void setElements(Map<String, String> elements) {
-    this.elements = elements;
-  }
-
   /**
    * Transform a String into an URI.
    * @param target the URI in String format.

Modified: 
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- 
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
 (original)
+++ 
incubator/droids/trunk/droids-core/src/test/java/org/apache/droids/DroidsFactory.java
 Fri Nov 14 11:53:13 2008
@@ -13,6 +13,7 @@
 import org.apache.droids.helper.factories.ParserFactory;
 import org.apache.droids.helper.factories.ProtocolFactory;
 import org.apache.droids.helper.factories.URLFiltersFactory;
+import org.apache.droids.impl.DefaultTaskExceptionHandler;
 import org.apache.droids.impl.SequentialTaskMaster;
 import org.apache.droids.impl.SimpleTaskQueue;
 import org.apache.droids.parse.html.HtmlParser;
@@ -80,6 +81,7 @@
 
     SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>();
     taskMaster.setDelayTimer( simpleDelayTimer );
+    taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler());
     
     CrawlingDroid crawler = new CrawlingDroid( simpleQueue, taskMaster );
     crawler.setFiltersFactory(filtersFactory);

Modified: 
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=714116&r1=714115&r2=714116&view=diff
==============================================================================
--- 
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
 (original)
+++ 
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
 Fri Nov 14 11:53:13 2008
@@ -18,6 +18,8 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.droids.ParseData;
 import org.apache.droids.api.ContentEntity;
@@ -40,19 +42,19 @@
 
   private org.apache.tika.parser.Parser parser = null;
 
-  private LinkExtractor extractor = null;
+  private Map<String, String> elements= null;
 
-
-  public LinkExtractor getExtractor() {
-    return extractor;
+  public Map<String, String> getElements() {
+    if (elements == null) {
+      elements = new HashMap<String, String>();
+    }
+    return elements;
   }
 
-
-  public void setExtractor(LinkExtractor extractor) {
-    this.extractor = extractor;
+  public void setElements(Map<String, String> elements) {
+    this.elements = elements;
   }
 
-
   public Parse getParse(ContentEntity entity, Link link) throws IOException, DroidsException {
     // Init Tika objects
     parser = new AutoDetectParser();
@@ -63,7 +65,7 @@
       charset = "UTF-8";
     }
     EchoHandler data = new EchoHandler(charset); 
-    extractor.setBase(link);
+    LinkExtractor extractor = new LinkExtractor(link, elements);
     
     TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);
 


Reply via email to