This is an automated email from the ASF dual-hosted git repository. rombert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-commons-html.git
commit 41222cd82995a41cccbe698db623fed00bd7c139 Author: Carsten Ziegeler <[email protected]> AuthorDate: Wed Nov 25 12:28:19 2009 +0000 SLING-1203 : Use tagsoup html parser instead of nekohtml git-svn-id: https://svn.apache.org/repos/asf/sling/trunk@884067 13f79535-47bb-0310-9956-ffa450edef68 --- NOTICE | 2 + pom.xml | 35 +---- .../apache/sling/commons/html/impl/DOMBuilder.java | 166 +++++++++++++++++++++ .../sling/commons/html/impl/HtmlParserImpl.java | 36 ++++- .../sling/commons/html/impl/NekohtmlDomParser.java | 62 -------- .../sling/commons/html/impl/NekohtmlSaxParser.java | 73 --------- 6 files changed, 206 insertions(+), 168 deletions(-) diff --git a/NOTICE b/NOTICE index 9e87228..be0c7d1 100644 --- a/NOTICE +++ b/NOTICE @@ -7,3 +7,5 @@ by Day Software (http://www.day.com/). This product includes software developed at The Apache Software Foundation (http://www.apache.org/). +This product includes software developed at +http://home.ccil.org/~cowan/XML/tagsoup/ \ No newline at end of file diff --git a/pom.xml b/pom.xml index 1e9bcad..4f6b247 100644 --- a/pom.xml +++ b/pom.xml @@ -54,19 +54,13 @@ <extensions>true</extensions> <configuration> <instructions> - <Import-Package> - !sun.io,* - </Import-Package> <Export-Package> org.apache.sling.commons.html </Export-Package> <Private-Package> - !org.cyberneko.dtd.ant, org.apache.sling.commons.html.impl, - org.cyberneko.*, - org.apache.xerces.parsers.*,org.apache.xerces.xni.parser.*, - org.apache.html.dom, org.apache.wml, org.apache.xerces.*, - org.apache.wml.dom, org.apache.xml.serialize, org.apache.xml.resolver.* + org.ccil.cowan.tagsoup, + org.ccil.cowan.tagsoup.jaxp </Private-Package> </instructions> </configuration> @@ -89,29 +83,8 @@ <dependencies> <dependency> - <groupId>xml-apis</groupId> - <artifactId>xml-apis</artifactId> - <version>1.3.04</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>nekohtml</groupId> - <artifactId>nekohtml</artifactId> - <version>0.9.5</version> - </dependency> - <dependency> - <groupId>nekohtml</groupId> - <artifactId>nekodtd</artifactId> - <version>0.1.11</version> - </dependency> - <dependency> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - <version>2.8.1</version> - </dependency> - <dependency> - <groupId>xml-resolver</groupId> - <artifactId>xml-resolver</artifactId> + <groupId>org.ccil.cowan.tagsoup</groupId> + <artifactId>tagsoup</artifactId> <version>1.2</version> </dependency> </dependencies> diff --git a/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java b/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java new file mode 100644 index 0000000..375ae9d --- /dev/null +++ b/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sling.commons.html.impl; + +import java.io.IOException; + +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMResult; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.ext.LexicalHandler; + +/** + * The <code>DOMBuilder</code> is a utility class that will generate a W3C + * DOM Document from SAX events. + * + */ +public class DOMBuilder implements ContentHandler, LexicalHandler { + + /** The default transformer factory shared by all instances */ + private static final SAXTransformerFactory FACTORY = (SAXTransformerFactory) TransformerFactory.newInstance(); + + /** The result */ + private final DOMResult result; + + private final ContentHandler contentHandler; + private final LexicalHandler lexicalHandler; + + /** + * Construct a new instance of this DOMBuilder. + */ + public DOMBuilder() throws IOException { + try { + final TransformerHandler handler = FACTORY.newTransformerHandler(); + this.contentHandler = handler; + this.lexicalHandler = handler; + this.result = new DOMResult(); + handler.setResult(this.result); + } catch (javax.xml.transform.TransformerException local) { + throw (IOException) new IOException("Fatal-Error: Unable to get transformer handler").initCause(local); + } + } + + /** + * Return the newly built Document. + */ + public Document getDocument() { + if (this.result.getNode() == null) { + return null; + } else if (this.result.getNode().getNodeType() == Node.DOCUMENT_NODE) { + return (Document) this.result.getNode(); + } else { + return this.result.getNode().getOwnerDocument(); + } + } + + public void setDocumentLocator(Locator locator) { + contentHandler.setDocumentLocator(locator); + } + + public void startDocument() + throws SAXException { + contentHandler.startDocument(); + } + + public void endDocument() + throws SAXException { + contentHandler.endDocument(); + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + contentHandler.startPrefixMapping(prefix, uri); + } + + public void endPrefixMapping(String prefix) + throws SAXException { + contentHandler.endPrefixMapping(prefix); + } + + public void startElement(String uri, String loc, String raw, Attributes a) + throws SAXException { + contentHandler.startElement(uri, loc, raw, a); + } + + public void endElement(String uri, String loc, String raw) + throws SAXException { + contentHandler.endElement(uri, loc, raw); + } + + public void characters(char c[], int start, int len) + throws SAXException { + contentHandler.characters(c, start, len); + } + + public void ignorableWhitespace(char c[], int start, int len) + throws SAXException { + contentHandler.ignorableWhitespace(c, start, len); + } + + public void processingInstruction(String target, String data) + throws SAXException { + contentHandler.processingInstruction(target, data); + } + + public void skippedEntity(String name) + throws SAXException { + contentHandler.skippedEntity(name); + } + + public void startDTD(String name, String publicId, String systemId) + throws SAXException { + lexicalHandler.startDTD(name, publicId, systemId); + } + + public void endDTD() + throws SAXException { + lexicalHandler.endDTD(); + } + + public void startEntity(String name) + throws SAXException { + lexicalHandler.startEntity(name); + } + + public void endEntity(String name) + throws SAXException { + lexicalHandler.endEntity(name); + } + + public void startCDATA() + throws SAXException { + lexicalHandler.startCDATA(); + } + + public void endCDATA() + throws SAXException { + lexicalHandler.endCDATA(); + } + + public void comment(char ch[], int start, int len) + throws SAXException { + lexicalHandler.comment(ch, start, len); + } +} diff --git a/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java b/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java index cda543e..ed9a28d 100644 --- a/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java +++ b/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java @@ -22,9 +22,12 @@ import java.io.IOException; import java.io.InputStream; import org.apache.sling.commons.html.HtmlParser; +import org.ccil.cowan.tagsoup.Parser; import org.w3c.dom.Document; import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.ext.LexicalHandler; /** * @scr.component metatype="false" @@ -38,13 +41,42 @@ public class HtmlParserImpl implements HtmlParser { */ public void parse(InputStream stream, String encoding, ContentHandler ch) throws SAXException { - NekohtmlSaxParser.parse(stream, encoding, ch); + final Parser parser = new Parser(); + if ( ch instanceof LexicalHandler ) { + parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch); + } + parser.setContentHandler(ch); + final InputSource source = new InputSource(stream); + source.setEncoding(encoding); + try { + parser.parse(source); + } catch (IOException ioe) { + throw new SAXException(ioe); + } } /** * @see org.apache.sling.commons.html.HtmlParser#parse(java.lang.String, java.io.InputStream, java.lang.String) */ public Document parse(String systemId, InputStream stream, String encoding) throws IOException { - return NekohtmlDomParser.parse(systemId, stream, encoding); + final Parser parser = new Parser(); + + final DOMBuilder builder = new DOMBuilder(); + + final InputSource source = new InputSource(stream); + source.setEncoding(encoding); + source.setSystemId(systemId); + + try { + parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder); + parser.setContentHandler(builder); + parser.parse(source); + } catch (SAXException se) { + if ( se.getCause() instanceof IOException ) { + throw (IOException) se.getCause(); + } + throw (IOException) new IOException("Unable to parse xml.").initCause(se); + } + return builder.getDocument(); } } diff --git a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java b/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java deleted file mode 100644 index 4bbe349..0000000 --- a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sling.commons.html.impl; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Properties; - -import org.apache.xerces.parsers.AbstractDOMParser; -import org.apache.xerces.xni.parser.XMLInputSource; -import org.cyberneko.html.HTMLConfiguration; -import org.w3c.dom.Document; - -/** - * DOM Parser based on the neko html parser. - */ -public class NekohtmlDomParser extends AbstractDOMParser { - - public NekohtmlDomParser(Properties properties) { - super(getConfig(properties)); - } - - protected static HTMLConfiguration getConfig(Properties properties) { - final HTMLConfiguration config = new HTMLConfiguration(); - config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); - if (properties != null) { - for (Iterator<Object> i = properties.keySet().iterator(); i.hasNext();) { - final String name = i.next().toString(); - config.setProperty(name, properties.getProperty(name)); - } - } - return config; - } - - /** - * Parse html. - */ - public static Document parse(String systemId, InputStream stream, String encoding) - throws IOException { - final NekohtmlDomParser parser = new NekohtmlDomParser(null); - XMLInputSource source = new XMLInputSource(null, systemId, null, stream, encoding); - parser.parse(source); - return parser.getDocument(); - } -} diff --git a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java b/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java deleted file mode 100644 index 5eba383..0000000 --- a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sling.commons.html.impl; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Properties; - -import org.apache.xerces.parsers.AbstractSAXParser; -import org.cyberneko.html.HTMLConfiguration; -import org.xml.sax.ContentHandler; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.ext.LexicalHandler; - -/** - * SAX Parser based on the neko html parser. - */ -public class NekohtmlSaxParser extends AbstractSAXParser { - - public NekohtmlSaxParser(Properties properties) { - super(getConfig(properties)); - } - - protected static HTMLConfiguration getConfig(Properties properties) { - final HTMLConfiguration config = new HTMLConfiguration(); - config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); - if (properties != null) { - for (Iterator<Object> i = properties.keySet().iterator(); i.hasNext();) { - final String name = i.next().toString(); - config.setProperty(name, properties.getProperty(name)); - } - } - return config; - } - - /** - * Parse html. - */ - public static void parse(InputStream stream, String encoding, ContentHandler ch) throws SAXException { - final NekohtmlSaxParser parser = new NekohtmlSaxParser(null); - parser.setContentHandler(ch); - if (ch instanceof LexicalHandler) { - parser.setLexicalHandler((LexicalHandler) ch); - } - final InputSource is = new InputSource(stream); - if ( encoding != null ) { - is.setEncoding(encoding); - } - try { - parser.parse(is); - } catch (IOException ioe) { - throw new SAXException("Error during parsing of html markup.", ioe); - } - } -} -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
