Author: wkasper
Date: Wed Nov 28 09:50:23 2012
New Revision: 1414617
URL: http://svn.apache.org/viewvc?rev=1414617&view=rev
Log:
STANBOL-770: Replace JTidy Html Parser by JSoup Parser
Added:
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
Modified:
stanbol/trunk/enhancer/engines/htmlextractor/pom.xml
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java
stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
Modified: stanbol/trunk/enhancer/engines/htmlextractor/pom.xml
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/pom.xml?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/htmlextractor/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/htmlextractor/pom.xml Wed Nov 28 09:50:23
2012
@@ -62,7 +62,7 @@
org.apache.stanbol.enhancer.engines.htmlextractor.*
</Private-Package>
<Embed-Dependency>
- jtidy;scope=compile
+ jsoup;scope=compile
</Embed-Dependency>
</instructions>
</configuration>
@@ -137,15 +137,15 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
-
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
- </dependency>
+ </dependency>
<dependency>
- <groupId>net.sf.jtidy</groupId>
- <artifactId>jtidy</artifactId>
- <version>r938</version>
- </dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.7.1</version>
+ </dependency>
</dependencies>
+
</project>
Added:
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java?rev=1414617&view=auto
==============================================================================
---
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
(added)
+++
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
Wed Nov 28 09:50:23 2012
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.htmlextractor.impl;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+
+/**
+ *
+ * @author <a href="mailto:[email protected]">Walter Kasper</a>
+ *
+ */
+
+public class DOMBuilder {
+    /**
+     * Converts a Jsoup document into a W3C DOM document that exposes the same
+     * elements, attributes and text.
+     *
+     * @param jsoupDocument The Jsoup document to convert.
+     * @return A newly created W3C Document.
+     * @throws RuntimeException if no {@link DocumentBuilder} can be obtained;
+     *         the {@link ParserConfigurationException} is attached as cause.
+     */
+    public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
+
+        Document document = null;
+
+        try {
+
+            /* Obtain the document builder for the configured XML parser. */
+            DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+            DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+
+            /* Create a document to contain the content. */
+            document = docBuilder.newDocument();
+            createDOM(jsoupDocument, document, document, new HashMap<String,String>());
+
+        } catch (ParserConfigurationException pce) {
+            throw new RuntimeException(pce);
+        }
+
+        return document;
+    }
+
+    /**
+     * The internal helper that recursively copies content from the specified
+     * Jsoup <tt>Node</tt> into a W3C {@link Node}.
+     * <ul>
+     * <li>A Jsoup document contributes only its children, which are copied
+     * into <code>out</code>.</li>
+     * <li>An element is re-created below <code>out</code> together with its
+     * attributes and children.</li>
+     * <li>A text node is appended to <code>out</code>, unless <code>out</code>
+     * is the document itself (a W3C document cannot hold text directly).</li>
+     * <li>Any other kind of Jsoup node is skipped.</li>
+     * </ul>
+     *
+     * @param node The Jsoup node containing the content to copy to the
+     *        specified W3C {@link Node}.
+     * @param out The W3C {@link Node} that receives the DOM content.
+     * @param doc The W3C document used as factory for the new nodes.
+     * @param ns The namespace declarations (prefix to URI) seen so far. One map
+     *        is shared by the whole traversal, so a declaration stays visible
+     *        to every node that is visited after it.
+     */
+    private static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map<String,String> ns) {
+
+        if (node instanceof org.jsoup.nodes.Document) {
+
+            org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node);
+            for (org.jsoup.nodes.Node n : d.childNodes()) {
+                createDOM(n, out, doc, ns);
+            }
+
+        } else if (node instanceof org.jsoup.nodes.Element) {
+
+            org.jsoup.nodes.Element e = ((org.jsoup.nodes.Element) node);
+            org.w3c.dom.Element _e = doc.createElement(e.tagName());
+            out.appendChild(_e);
+            org.jsoup.nodes.Attributes atts = e.attributes();
+
+            for (org.jsoup.nodes.Attribute a : atts) {
+                String attName = a.getKey();
+                //omit xhtml namespace
+                if (attName.equals("xmlns")) {
+                    continue;
+                }
+                String attPrefix = getNSPrefix(attName);
+                if (attPrefix != null) {
+                    if (attPrefix.equals("xmlns")) {
+                        // remember the declared prefix; the declaration itself
+                        // is still copied as an attribute below
+                        ns.put(getLocalName(attName), a.getValue());
+                    }
+                    else if (!attPrefix.equals("xml")) {
+                        String namespace = ns.get(attPrefix);
+                        if (namespace == null) {
+                            //fix attribute names looking like qnames
+                            attName = attName.replace(':','_');
+                        }
+                    }
+                }
+                try {
+                    _e.setAttribute(attName, a.getValue());
+                } catch (org.w3c.dom.DOMException ex) {
+                    if (ex.code != org.w3c.dom.DOMException.INVALID_CHARACTER_ERR) {
+                        throw ex;
+                    }
+                    // Sloppy HTML can yield attribute names that are no legal
+                    // XML names. Dropping such an attribute is preferable to
+                    // aborting the conversion of the whole document.
+                }
+            }
+
+            for (org.jsoup.nodes.Node n : e.childNodes()) {
+                createDOM(n, _e, doc, ns);
+            }
+
+        } else if (node instanceof org.jsoup.nodes.TextNode) {
+
+            org.jsoup.nodes.TextNode t = ((org.jsoup.nodes.TextNode) node);
+            if (!(out instanceof Document)) {
+                out.appendChild(doc.createTextNode(t.text()));
+            }
+        }
+    }
+
+    // some hacks for handling namespace in jsoup2DOM conversion
+
+    /**
+     * Returns the part of a name in front of its first colon.
+     *
+     * @param name an attribute name, may be <code>null</code>
+     * @return the prefix, or <code>null</code> if the name is <code>null</code>,
+     *         contains no colon or starts with a colon
+     */
+    private static String getNSPrefix(String name) {
+        if (name != null) {
+            int pos = name.indexOf(':');
+            if (pos > 0) {
+                return name.substring(0,pos);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the part of a name behind its last colon.
+     *
+     * @param name an attribute name, may be <code>null</code>
+     * @return the local part; the unchanged name if it is <code>null</code> or
+     *         has no colon behind its first character
+     */
+    private static String getLocalName(String name) {
+        if (name != null) {
+            int pos = name.lastIndexOf(':');
+            if (pos > 0) {
+                return name.substring(pos+1);
+            }
+        }
+        return name;
+    }
+
+}
Modified:
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
---
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java
(original)
+++
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMUtils.java
Wed Nov 28 09:50:23 2012
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
+import java.io.PrintStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
@@ -57,6 +58,85 @@ import org.xml.sax.SAXParseException;
*/
public final class DOMUtils {
+    /**
+     * Prints the specified node and all of its children to a PrintStream.
+     * Attribute values and text are escaped with
+     * {@link #quoteXMLChars(String)}; names and values are trimmed before
+     * they are written. A node type that is not handled is reported on
+     * <code>System.err</code> and otherwise skipped.
+     *
+     * @param node a DOM <code>Node</code>
+     * @param out the stream that receives the markup
+     */
+    public static void printDOM(Node node, PrintStream out) {
+
+        int type = node.getNodeType();
+        switch (type) {
+            // print the document element
+            case Node.DOCUMENT_NODE:
+                out.println("<?xml version=\"1.0\" ?>");
+                Node root = ((Document)node).getDocumentElement();
+                // an empty document has no document element to descend into
+                if (root != null) {
+                    printDOM(root,out);
+                }
+                break;
+
+            // print element with attributes, children and closing tag
+            case Node.ELEMENT_NODE:
+                out.print("<");
+                out.print(node.getNodeName());
+                NamedNodeMap attrs = node.getAttributes();
+                for (int i = 0; i < attrs.getLength(); i++) {
+                    Node attr = attrs.item(i);
+                    out.print(" " + attr.getNodeName().trim() + "=\""
+                        + quoteXMLChars(attr.getNodeValue().trim()) + "\"");
+                }
+                out.println(">");
+
+                NodeList children = node.getChildNodes();
+                if (children != null) {
+                    int len = children.getLength();
+                    for (int i = 0; i < len; i++) {
+                        printDOM(children.item(i),out);
+                    }
+                }
+
+                out.println();
+                out.print("</");
+                out.print(node.getNodeName().trim());
+                out.println('>');
+                break;
+
+            // handle entity reference nodes
+            case Node.ENTITY_REFERENCE_NODE:
+                out.print("&");
+                out.print(node.getNodeName().trim());
+                out.print(";");
+                break;
+
+            // print cdata sections
+            case Node.CDATA_SECTION_NODE:
+                out.print("<![CDATA[");
+                out.print(node.getNodeValue().trim());
+                out.print("]]>");
+                break;
+
+            // print text
+            case Node.TEXT_NODE:
+                out.print(quoteXMLChars(node.getNodeValue().trim()));
+                break;
+
+            // print processing instruction
+            case Node.PROCESSING_INSTRUCTION_NODE:
+                out.print("<?");
+                out.print(node.getNodeName().trim());
+                String data = node.getNodeValue().trim();
+                out.print(" ");
+                out.print(data);
+                out.print("?>");
+                break;
+
+            default:
+                System.err.println("unknown type " + type);
+                break;
+        }
+    }
+
/**
* This prints the given DOM document to System.out with indentation and
* utf-8 encoding.
@@ -497,5 +577,13 @@ public final class DOMUtils {
ele.appendChild(child);
return child;
}
-
+
+    /**
+     * Replaces the five characters that have a predefined entity in XML
+     * (<code>&amp;</code>, <code>&lt;</code>, <code>&gt;</code>, double quote
+     * and apostrophe) by the corresponding entity reference.
+     *
+     * @param text the text to escape, may be <code>null</code>
+     * @return the escaped text, or <code>null</code> if the input was
+     *         <code>null</code>
+     */
+    public static String quoteXMLChars(String text) {
+        if (text != null) {
+            // the ampersand goes first so that the ampersands introduced by
+            // the later replacements are not escaped a second time
+            return text.replace("&", "&amp;")
+                       .replace("<", "&lt;")
+                       .replace(">", "&gt;")
+                       .replace("\"", "&quot;")
+                       .replace("'", "&apos;");
+        }
+        return text;
+    }
+
}
+
Modified:
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
---
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java
(original)
+++
stanbol/trunk/enhancer/engines/htmlextractor/src/main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlParser.java
Wed Nov 28 09:50:23 2012
@@ -17,15 +17,17 @@
package org.apache.stanbol.enhancer.engines.htmlextractor.impl;
import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
-import javax.xml.parsers.ParserConfigurationException;
-
+import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-import org.w3c.tidy.Tidy;
/**
* HtmlParser.java
@@ -40,36 +42,65 @@ public class HtmlParser {
*/
private static final Logger LOG =
LoggerFactory.getLogger(HtmlParser.class);
- private Tidy htmlToXmlParser;
-
+ private String baseURI = "";
+
public HtmlParser() {
- this.htmlToXmlParser = new Tidy();
- this.htmlToXmlParser.setTidyMark(false);
- this.htmlToXmlParser.setDropEmptyParas(true);
- this.htmlToXmlParser.setQuiet(true);
- this.htmlToXmlParser.setQuoteAmpersand(true);
- this.htmlToXmlParser.setShowWarnings(false);
- this.htmlToXmlParser.setShowErrors(0);
- this.htmlToXmlParser.setNumEntities(true);
- this.htmlToXmlParser.setHideComments(true);
- this.htmlToXmlParser.setOutputEncoding("UTF-8");
- this.htmlToXmlParser.setXmlOut(true);
}
- public Document getDOM(String html) {
- if (html != null) {
- return getDOM(new ByteArrayInputStream(html.getBytes()), null);
- }
- return null;
+    /**
+     * Returns the base URI that is handed to the HTML parser together with
+     * the document to parse.
+     *
+     * @return the baseURI
+     */
+    public String getBaseURI() {
+        return this.baseURI;
+    }
+
+
+    /**
+     * Sets the base URI that is handed to the HTML parser together with the
+     * document to parse.
+     *
+     * @param baseURI the baseURI to set; <code>null</code> is treated like the
+     *        empty string, which is also the initial value
+     */
+    public void setBaseURI(String baseURI) {
+        // never store null: the value is passed on to Jsoup as it is
+        this.baseURI = (baseURI == null) ? "" : baseURI;
}
- public synchronized Document getDOM(InputStream html, String charset) {
- if (charset != null) {
- htmlToXmlParser.setInputEncoding(charset);
+    /**
+     * Parses HTML given as a string.
+     *
+     * @param html the HTML source, may be <code>null</code>
+     * @return the DOM of the document; <code>null</code> if <code>html</code>
+     *         is <code>null</code> or the conversion failed
+     */
+    public Document getDOM(String html) {
+        if (html != null) {
+            // Encode and decode with the same, explicitly named charset. The
+            // platform default used by getBytes() need not match the charset
+            // that is assumed when the bytes are parsed again.
+            java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
+            return getDOM(new ByteArrayInputStream(html.getBytes(utf8)), utf8.name());
+        }
+        return null;
+    }
+
+    /**
+     * Parses HTML read from a stream. Failures are logged, not propagated.
+     *
+     * @param html the stream providing the HTML source
+     * @param charset the name of the charset of the stream; passed to Jsoup
+     *        unchanged, <code>null</code> included
+     * @return the DOM of the document, or <code>null</code> if the stream
+     *         could not be read or the conversion failed
+     */
+    public Document getDOM(InputStream html, String charset) {
+        Document doc = null;
+        try {
+            doc = DOMBuilder.jsoup2DOM(Jsoup.parse(html, charset, baseURI));
+        } catch (RuntimeException e) {
+            LOG.error("Unable to convert the HTML input into a DOM", e);
+        } catch (IOException e) {
+            LOG.error("Unable to read the HTML input", e);
+        }
- Document doc = htmlToXmlParser.parseDOM(html, null);
return doc;
}
+
+    /**
+     * Command line entry point: parses every HTML file named on the command
+     * line and writes the resulting DOM as UTF-8 encoded XML to
+     * <code>&lt;file name&gt;.xml</code> in the current directory.
+     *
+     * @param args leading options (<code>-enc &lt;charset&gt;</code> sets the
+     *        charset of the input files), followed by the files to convert
+     * @throws Exception if a file cannot be read or written
+     * @throws IllegalArgumentException if <code>-enc</code> is given without
+     *         a charset name
+     */
+    public static void main(String[] args) throws Exception {
+        int argv = 0;
+        String encoding = null;
+        while (argv < args.length && args[argv].startsWith("-")) {
+            if (args[argv].equals("-enc")) {
+                if (argv + 1 >= args.length) {
+                    throw new IllegalArgumentException("Option -enc requires a charset name");
+                }
+                encoding = args[++argv];
+            }
+            ++argv;
+        }
+        HtmlParser parser = new HtmlParser();
+        for (int i = argv; i < args.length; ++i) {
+//            parser.setBaseURI(new File(args[i]).toURI().toString());
+            Document doc;
+            InputStream is = new FileInputStream(args[i]);
+            try {
+                doc = parser.getDOM(is,encoding);
+            } finally {
+                is.close();
+            }
+            if (doc == null) {
+                // getDOM reports a failed conversion by returning null
+                System.err.println("Could not parse " + args[i]);
+                continue;
+            }
+            OutputStream out = new FileOutputStream(new File(args[i]).getName()+".xml");
+            try {
+                DOMUtils.writeXml(doc,"UTF-8",null,out);
+            } finally {
+                out.close();
+            }
+        }
+    }
}
Modified:
stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java?rev=1414617&r1=1414616&r2=1414617&view=diff
==============================================================================
---
stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
(original)
+++
stanbol/trunk/enhancer/engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java
Wed Nov 28 09:50:23 2012
@@ -135,7 +135,7 @@ public class TestHtmlExtractor {
// show triples
int tripleCounter = model.size();
- LOG.info("Triples: {}",tripleCounter);
+ LOG.debug("Triples: {}",tripleCounter);
printTriples(model);
Set<NonLiteral> roots = ClerezzaRDFUtils.findRoots(model);
assertTrue(roots.size() > 1);