svn commit: r1414617 - in /stanbol/trunk/enhancer/engines/htmlextractor: ./ src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/ src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/

wkasper Wed, 28 Nov 2012 01:50:56 -0800

Author: wkasper
Date: Wed Nov 28 09:50:23 2012
New Revision: 1414617

URL: http://svn.apache.org/viewvc?rev=1414617&view=rev
Log:
STANBOL-770: Replace JTidy Html Parser by JSoup Parser


Added:
    stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
Modified:
    stanbol/trunk/enhancer/engines/htmlextractor/pom.xml
    stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java
    stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java
    stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java

Modified: stanbol/trunk/enhancer/engines/htmlextractor/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/pom.xml?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/htmlextractor/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/htmlextractor/pom.xml Wed Nov 28 09:50:23 2012
@@ -62,7 +62,7 @@
               org.apache.stanbol.enhancer.engines.htmlextractor.*
             </Private-Package>
             <Embed-Dependency>
-               jtidy;scope=compile
+               jsoup;scope=compile
             </Embed-Dependency>
           </instructions>
         </configuration>
@@ -137,15 +137,15 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
-
     <dependency>
        <groupId>com.ibm.icu</groupId>
        <artifactId>icu4j</artifactId>
-    </dependency>
+    </dependency>              
     <dependency>
-       <groupId>net.sf.jtidy</groupId>
-       <artifactId>jtidy</artifactId>
-       <version>r938</version>
-    </dependency>
+                 <groupId>org.jsoup</groupId>
+                 <artifactId>jsoup</artifactId>
+                 <version>1.7.1</version>
+               </dependency>
        </dependencies>
+    
 </project>

Added: stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java?rev=1414617&view=auto
==============================================================================
--- stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java (added)
+++ stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java Wed Nov 28 09:50:23 2012
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.htmlextractor.impl;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+
+/**
+ *
+ * @author <a href="mailto:[email protected]">Walter Kasper</a>
+ * 
+ */
+
+public class DOMBuilder {
+  /**
+   * Returns a W3C DOM that exposes the same content as the supplied Jsoup document into a W3C DOM.
+   * @param jsoupDocument The Jsoup document to convert.
+   * @return A W3C Document.
+   */
+  public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
+    
+    Document document = null;
+    
+    try {
+      
+      /* Obtain the document builder for the configured XML parser. */
+      DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+      
+      /* Create a document to contain the content. */
+      document = docBuilder.newDocument();
+      createDOM(jsoupDocument, document, document, new HashMap<String,String>());
+      
+    } catch (ParserConfigurationException pce) {
+      throw new RuntimeException(pce);
+    }
+    
+    return document;
+  }
+  
+  /**
+   * The internal helper that copies content from the specified Jsoup <tt>Node</tt> into a W3C {@link Node}.
+   * @param node The Jsoup node containing the content to copy to the specified W3C {@link Node}.
+   * @param out The W3C {@link Node} that receives the DOM content.
+   */
+  private static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String,String> ns) {
+         
+    if (node instanceof org.jsoup.nodes.Document) {
+      
+      org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node);
+      for (org.jsoup.nodes.Node n : d.childNodes()) {
+        createDOM(n, out,doc,ns);
+      }
+      
+    } else if (node instanceof org.jsoup.nodes.Element) {
+      
+      org.jsoup.nodes.Element e = ((org.jsoup.nodes.Element) node);
+      org.w3c.dom.Element _e = doc.createElement(e.tagName());
+      out.appendChild(_e);
+      org.jsoup.nodes.Attributes atts = e.attributes();
+      
+      for(org.jsoup.nodes.Attribute a : atts){
+        String attName = a.getKey();
+        //omit xhtml namespace
+        if (attName.equals("xmlns")) {
+          continue;
+        }
+        String attPrefix = getNSPrefix(attName);
+        if (attPrefix != null) {
+          if (attPrefix.equals("xmlns")) {
+            ns.put(getLocalName(attName), a.getValue());
+          }
+          else if (!attPrefix.equals("xml")) {
+            String namespace = ns.get(attPrefix);
+            if (namespace == null) {
+              //fix attribute names looking like qnames
+              attName = attName.replace(':','_');
+            }
+          }
+        }
+        _e.setAttribute(attName, a.getValue());
+      }
+      
+      for (org.jsoup.nodes.Node n : e.childNodes()) {
+        createDOM(n, _e, doc,ns);
+      }
+      
+    } else if (node instanceof org.jsoup.nodes.TextNode) {
+      
+      org.jsoup.nodes.TextNode t = ((org.jsoup.nodes.TextNode) node);
+      if (!(out instanceof Document)) {
+        out.appendChild(doc.createTextNode(t.text()));
+      }
+    }
+  }
+  
+  // some hacks for handling namespace in jsoup2DOM conversion
+  private static String getNSPrefix(String name) {
+    if (name != null) {
+      int pos = name.indexOf(':');
+      if (pos > 0) {
+        return name.substring(0,pos);
+      }
+    }
+    return null;
+  }
+  
+  private static String getLocalName(String name) {
+    if (name != null) {
+      int pos = name.lastIndexOf(':');
+      if (pos > 0) {
+        return name.substring(pos+1);
+      }
+    }
+    return name;
+  }
+
+}

Modified: stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java (original)
+++ stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java Wed Nov 28 09:50:23 2012
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.io.PrintStream;
 import java.io.StringWriter;
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
@@ -57,6 +58,85 @@ import org.xml.sax.SAXParseException;
  */
 public final class DOMUtils {
 
+  /** 
+   * This prints the specified node and all of its children to a PrintStream.
+   * 
+   * @param node a DOM <code>Node</code>
+   */
+  public static void printDOM(Node node, PrintStream out) {
+    
+    int type = node.getNodeType();
+    switch (type) {
+      // print the document element
+      case Node.DOCUMENT_NODE: 
+        out.println("<?xml version=\"1.0\" ?>");
+        printDOM(((Document)node).getDocumentElement(),out);
+        break;
+
+        // print element with attributes
+      case Node.ELEMENT_NODE: 
+        out.print("<");
+        out.print(node.getNodeName());
+        NamedNodeMap attrs = node.getAttributes();
+        for (int i = 0; i < attrs.getLength(); i++) {
+          Node attr = attrs.item(i);
+          out.print(" " + attr.getNodeName().trim() + "=\""
+            + quoteXMLChars(attr.getNodeValue().trim()) + "\"");
+        }
+        out.println(">");
+        
+        NodeList children = node.getChildNodes();
+        if (children != null) {
+          int len = children.getLength();
+          for (int i = 0; i < len; i++) {
+            printDOM(children.item(i),out);
+          }
+        }
+        
+        break;
+        
+        // handle entity reference nodes
+      case Node.ENTITY_REFERENCE_NODE:
+        out.print("&");
+        out.print(node.getNodeName().trim());
+        out.print(";");
+        break;
+        
+        // print cdata sections
+      case Node.CDATA_SECTION_NODE:
+        out.print("<![CDATA[");
+        out.print(node.getNodeValue().trim());
+        out.print("]]>");
+        break;
+        
+        // print text
+      case Node.TEXT_NODE:
+        out.print(quoteXMLChars(node.getNodeValue().trim()));
+        break;
+        
+        // print processing instruction
+      case Node.PROCESSING_INSTRUCTION_NODE:
+        out.print("<?");
+        out.print(node.getNodeName().trim());
+        String data = node.getNodeValue().trim();
+        out.print(" ");
+        out.print(data);
+        out.print("?>");
+        break;
+        
+      default:
+        System.err.println("unknown type " + type);
+        break;
+    }
+    
+    if (type == Node.ELEMENT_NODE) {
+      out.println();
+      out.print("</");
+      out.print(node.getNodeName().trim());
+      out.println('>');
+    }
+  }
+
     /**
      * This prints the given DOM document to System.out with indentation and
      * utf-8 encoding.
@@ -497,5 +577,13 @@ public final class DOMUtils {
         ele.appendChild(child);
         return child;
     }
-
+    
+    public static String quoteXMLChars(String text) {
+      if (text != null) {
+        return text.replace("&", "&amp;").replace("<","&lt;").replace(">", "&gt;").replace("\"", "&quot;").replace("'", "&apos;");
+      }
+      return text;
+    }
+ 
 }
+

Modified: stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java (original)
+++ stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java Wed Nov 28 09:50:23 2012
@@ -17,15 +17,17 @@
 package org.apache.stanbol.enhancer.engines.htmlextractor.impl;
 
 import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 
-import javax.xml.parsers.ParserConfigurationException;
-
+import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
-import org.w3c.tidy.Tidy;
 
 /**
  * HtmlParser.java
@@ -40,36 +42,65 @@ public class HtmlParser {
      */
     private static final Logger LOG = LoggerFactory.getLogger(HtmlParser.class);
 
-    private Tidy htmlToXmlParser;
-
+    private String baseURI = "";
+    
     public HtmlParser() {
-        this.htmlToXmlParser = new Tidy();
-        this.htmlToXmlParser.setTidyMark(false);
-        this.htmlToXmlParser.setDropEmptyParas(true);
-        this.htmlToXmlParser.setQuiet(true);
-        this.htmlToXmlParser.setQuoteAmpersand(true);
-        this.htmlToXmlParser.setShowWarnings(false);
-        this.htmlToXmlParser.setShowErrors(0);
-        this.htmlToXmlParser.setNumEntities(true);
-        this.htmlToXmlParser.setHideComments(true);
-        this.htmlToXmlParser.setOutputEncoding("UTF-8");
-        this.htmlToXmlParser.setXmlOut(true);
     }
 
 
-    public Document getDOM(String html) {
-        if (html != null) {
-            return getDOM(new ByteArrayInputStream(html.getBytes()), null);
-        }
-        return null;
+    /**
+     * @return the baseURI
+     */
+    public String getBaseURI() {
+      return baseURI;
+    }
+
+
+    /**
+     * @param baseURI the baseURI to set
+     */
+    public void setBaseURI(String baseURI) {
+      this.baseURI = baseURI;
     }
 
 
-    public synchronized Document getDOM(InputStream html, String charset) {
-        if (charset != null) {
-            htmlToXmlParser.setInputEncoding(charset);
+    public Document getDOM(String html) {        
+      if (html != null) {
+        return getDOM(new ByteArrayInputStream(html.getBytes()), null);
+      }
+      return null;
+    }
+
+    public Document getDOM(InputStream html, String charset) {
+        Document doc = null;
+        try {
+            doc = DOMBuilder.jsoup2DOM(Jsoup.parse(html, charset, baseURI));
+        } catch (RuntimeException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
         }
-        Document doc = htmlToXmlParser.parseDOM(html, null);
         return doc;
     }
+    
+    public static void main(String[] args) throws Exception {
+      int argv = 0;
+      String encoding = null;
+      while (argv < args.length && args[argv].startsWith("-")) {
+        if (args[argv].equals("-enc")) {
+          encoding = args[++argv];
+        }
+        ++argv;
+      }
+      HtmlParser parser = new HtmlParser();
+      for (int i = argv; i < args.length; ++i) {
+//        parser.setBaseURI(new File(args[i]).toURI().toString());
+        InputStream is = new FileInputStream(args[i]);
+        Document doc = parser.getDOM(is,encoding);
+        OutputStream out = new FileOutputStream(new File(args[i]).getName()+".xml");
+        DOMUtils.writeXml(doc,"UTF-8",null,out);
+        out.close();
+        is.close();
+      }
+    }
 }

Modified: stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java (original)
+++ stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java Wed Nov 28 09:50:23 2012
@@ -135,7 +135,7 @@ public class TestHtmlExtractor {
 
         // show triples
         int tripleCounter = model.size();
-        LOG.info("Triples: {}",tripleCounter);
+        LOG.debug("Triples: {}",tripleCounter);
         printTriples(model);
         Set<NonLiteral> roots = ClerezzaRDFUtils.findRoots(model);
         assertTrue(roots.size() > 1);

svn commit: r1414617 - in /stanbol/trunk/enhancer/engines/htmlextractor: ./ src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/ src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/

Reply via email to