Author: cziegeler
Date: Wed Nov 25 12:28:19 2009
New Revision: 884067
URL: http://svn.apache.org/viewvc?rev=884067&view=rev
Log:
SLING-1203 : Use tagsoup html parser instead of nekohtml
Added:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
(with props)
Removed:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java
Modified:
sling/trunk/contrib/commons/html/NOTICE
sling/trunk/contrib/commons/html/pom.xml
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
Modified: sling/trunk/contrib/commons/html/NOTICE
URL:
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/NOTICE?rev=884067&r1=884066&r2=884067&view=diff
==============================================================================
--- sling/trunk/contrib/commons/html/NOTICE (original)
+++ sling/trunk/contrib/commons/html/NOTICE Wed Nov 25 12:28:19 2009
@@ -7,3 +7,5 @@
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
+This product includes software developed at
+http://home.ccil.org/~cowan/XML/tagsoup/
\ No newline at end of file
Modified: sling/trunk/contrib/commons/html/pom.xml
URL:
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/pom.xml?rev=884067&r1=884066&r2=884067&view=diff
==============================================================================
--- sling/trunk/contrib/commons/html/pom.xml (original)
+++ sling/trunk/contrib/commons/html/pom.xml Wed Nov 25 12:28:19 2009
@@ -54,19 +54,13 @@
<extensions>true</extensions>
<configuration>
<instructions>
- <Import-Package>
- !sun.io,*
- </Import-Package>
<Export-Package>
org.apache.sling.commons.html
</Export-Package>
<Private-Package>
- !org.cyberneko.dtd.ant,
org.apache.sling.commons.html.impl,
- org.cyberneko.*,
-
org.apache.xerces.parsers.*,org.apache.xerces.xni.parser.*,
- org.apache.html.dom, org.apache.wml,
org.apache.xerces.*,
- org.apache.wml.dom, org.apache.xml.serialize,
org.apache.xml.resolver.*
+ org.ccil.cowan.tagsoup,
+ org.ccil.cowan.tagsoup.jaxp
</Private-Package>
</instructions>
</configuration>
@@ -89,29 +83,8 @@
<dependencies>
<dependency>
- <groupId>xml-apis</groupId>
- <artifactId>xml-apis</artifactId>
- <version>1.3.04</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- <version>0.9.5</version>
- </dependency>
- <dependency>
- <groupId>nekohtml</groupId>
- <artifactId>nekodtd</artifactId>
- <version>0.1.11</version>
- </dependency>
- <dependency>
- <groupId>xerces</groupId>
- <artifactId>xercesImpl</artifactId>
- <version>2.8.1</version>
- </dependency>
- <dependency>
- <groupId>xml-resolver</groupId>
- <artifactId>xml-resolver</artifactId>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
<version>1.2</version>
</dependency>
</dependencies>
Added:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
URL:
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java?rev=884067&view=auto
==============================================================================
---
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
(added)
+++
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
Wed Nov 25 12:28:19 2009
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sling.commons.html.impl;
+
+import java.io.IOException;
+
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMResult;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * The <code>DOMBuilder</code> is a utility class that will generate a W3C
+ * DOM Document from SAX events.
+ *
+ * It implements both {@link ContentHandler} and {@link LexicalHandler} and
+ * forwards every callback unchanged to a single {@link TransformerHandler}
+ * whose output is collected in a {@link DOMResult}. The document built so
+ * far is read back through {@link #getDocument()}.
+ */
+public class DOMBuilder implements ContentHandler, LexicalHandler {
+
+ /** The default transformer factory shared by all instances */
+ private static final SAXTransformerFactory FACTORY =
(SAXTransformerFactory) TransformerFactory.newInstance();
+
+ /** The result that receives the nodes produced by the transformer handler */
+ private final DOMResult result;
+
+ private final ContentHandler contentHandler; // target of all ContentHandler callbacks
+ private final LexicalHandler lexicalHandler; // same TransformerHandler instance as contentHandler
+
+ /**
+ * Construct a new instance of this DOMBuilder.
+ * A new transformer handler is obtained from the shared factory and is
+ * configured to write into a fresh {@link DOMResult}.
+ *
+ * @throws IOException if obtaining the transformer handler fails with a
+ * TransformerException; that exception is set as the cause
+ */
+ public DOMBuilder() throws IOException {
+ try {
+ final TransformerHandler handler = FACTORY.newTransformerHandler();
+ this.contentHandler = handler;
+ this.lexicalHandler = handler;
+ this.result = new DOMResult();
+ handler.setResult(this.result);
+ } catch (javax.xml.transform.TransformerException local) {
+ throw (IOException) new IOException("Fatal-Error: Unable to get
transformer handler").initCause(local);
+ }
+ }
+
+ /**
+ * Return the newly built Document.
+ *
+ * @return the result node itself if it is a document node, otherwise the
+ * owner document of the result node; <code>null</code> if the result
+ * holds no node
+ */
+ public Document getDocument() {
+ if (this.result.getNode() == null) {
+ return null;
+ } else if (this.result.getNode().getNodeType() == Node.DOCUMENT_NODE) {
+ return (Document) this.result.getNode();
+ } else {
+ return this.result.getNode().getOwnerDocument();
+ }
+ }
+
+ public void setDocumentLocator(Locator locator) { // ContentHandler callbacks start here; each one is forwarded unchanged to contentHandler
+ contentHandler.setDocumentLocator(locator);
+ }
+
+ public void startDocument()
+ throws SAXException {
+ contentHandler.startDocument();
+ }
+
+ public void endDocument()
+ throws SAXException {
+ contentHandler.endDocument();
+ }
+
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ contentHandler.startPrefixMapping(prefix, uri);
+ }
+
+ public void endPrefixMapping(String prefix)
+ throws SAXException {
+ contentHandler.endPrefixMapping(prefix);
+ }
+
+ public void startElement(String uri, String loc, String raw, Attributes a)
+ throws SAXException {
+ contentHandler.startElement(uri, loc, raw, a);
+ }
+
+ public void endElement(String uri, String loc, String raw)
+ throws SAXException {
+ contentHandler.endElement(uri, loc, raw);
+ }
+
+ public void characters(char c[], int start, int len)
+ throws SAXException {
+ contentHandler.characters(c, start, len);
+ }
+
+ public void ignorableWhitespace(char c[], int start, int len)
+ throws SAXException {
+ contentHandler.ignorableWhitespace(c, start, len);
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ contentHandler.processingInstruction(target, data);
+ }
+
+ public void skippedEntity(String name)
+ throws SAXException {
+ contentHandler.skippedEntity(name);
+ }
+
+ public void startDTD(String name, String publicId, String systemId) // LexicalHandler callbacks start here; each one is forwarded unchanged to lexicalHandler
+ throws SAXException {
+ lexicalHandler.startDTD(name, publicId, systemId);
+ }
+
+ public void endDTD()
+ throws SAXException {
+ lexicalHandler.endDTD();
+ }
+
+ public void startEntity(String name)
+ throws SAXException {
+ lexicalHandler.startEntity(name);
+ }
+
+ public void endEntity(String name)
+ throws SAXException {
+ lexicalHandler.endEntity(name);
+ }
+
+ public void startCDATA()
+ throws SAXException {
+ lexicalHandler.startCDATA();
+ }
+
+ public void endCDATA()
+ throws SAXException {
+ lexicalHandler.endCDATA();
+ }
+
+ public void comment(char ch[], int start, int len)
+ throws SAXException {
+ lexicalHandler.comment(ch, start, len);
+ }
+}
Propchange:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
------------------------------------------------------------------------------
svn:keywords = author date id revision rev url
Propchange:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
URL:
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java?rev=884067&r1=884066&r2=884067&view=diff
==============================================================================
---
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
(original)
+++
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
Wed Nov 25 12:28:19 2009
@@ -22,9 +22,12 @@
import java.io.InputStream;
import org.apache.sling.commons.html.HtmlParser;
+import org.ccil.cowan.tagsoup.Parser;
import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
/**
* @scr.component metatype="false"
@@ -38,13 +41,42 @@
*/
public void parse(InputStream stream, String encoding, ContentHandler ch)
throws SAXException {
- NekohtmlSaxParser.parse(stream, encoding, ch);
+ final Parser parser = new Parser();
+ if ( ch instanceof LexicalHandler ) {
+
parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch);
+ }
+ parser.setContentHandler(ch);
+ final InputSource source = new InputSource(stream);
+ source.setEncoding(encoding);
+ try {
+ parser.parse(source);
+ } catch (IOException ioe) {
+ throw new SAXException(ioe);
+ }
}
/**
* @see org.apache.sling.commons.html.HtmlParser#parse(java.lang.String,
java.io.InputStream, java.lang.String)
*/
public Document parse(String systemId, InputStream stream, String
encoding) throws IOException {
- return NekohtmlDomParser.parse(systemId, stream, encoding);
+ final Parser parser = new Parser();
+
+ final DOMBuilder builder = new DOMBuilder();
+
+ final InputSource source = new InputSource(stream);
+ source.setEncoding(encoding);
+ source.setSystemId(systemId);
+
+ try {
+
parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+ parser.setContentHandler(builder);
+ parser.parse(source);
+ } catch (SAXException se) {
+ if ( se.getCause() instanceof IOException ) {
+ throw (IOException) se.getCause();
+ }
+ throw (IOException) new IOException("Unable to parse
xml.").initCause(se);
+ }
+ return builder.getDocument();
}
}