Author: cziegeler
Date: Wed Nov 25 12:28:19 2009
New Revision: 884067

URL: http://svn.apache.org/viewvc?rev=884067&view=rev
Log:
SLING-1203 : Use tagsoup html parser instead of nekohtml

Added:
    
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
   (with props)
Removed:
    
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java
    
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java
Modified:
    sling/trunk/contrib/commons/html/NOTICE
    sling/trunk/contrib/commons/html/pom.xml
    
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java

Modified: sling/trunk/contrib/commons/html/NOTICE
URL: 
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/NOTICE?rev=884067&r1=884066&r2=884067&view=diff
==============================================================================
--- sling/trunk/contrib/commons/html/NOTICE (original)
+++ sling/trunk/contrib/commons/html/NOTICE Wed Nov 25 12:28:19 2009
@@ -7,3 +7,5 @@
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
 
+This product includes software developed at
+http://home.ccil.org/~cowan/XML/tagsoup/
\ No newline at end of file

Modified: sling/trunk/contrib/commons/html/pom.xml
URL: 
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/pom.xml?rev=884067&r1=884066&r2=884067&view=diff
==============================================================================
--- sling/trunk/contrib/commons/html/pom.xml (original)
+++ sling/trunk/contrib/commons/html/pom.xml Wed Nov 25 12:28:19 2009
@@ -54,19 +54,13 @@
                 <extensions>true</extensions>
                 <configuration>
                     <instructions>
-                        <Import-Package>
-                            !sun.io,*
-                        </Import-Package>
                         <Export-Package>
                             org.apache.sling.commons.html
                         </Export-Package>
                         <Private-Package>
-                            !org.cyberneko.dtd.ant,
                             org.apache.sling.commons.html.impl,
-                            org.cyberneko.*,
-                            org.apache.xerces.parsers.*,org.apache.xerces.xni.parser.*,
-                            org.apache.html.dom, org.apache.wml, org.apache.xerces.*,
-                            org.apache.wml.dom, org.apache.xml.serialize, org.apache.xml.resolver.*
+                            org.ccil.cowan.tagsoup,
+                            org.ccil.cowan.tagsoup.jaxp
                         </Private-Package>
                     </instructions>
                 </configuration>
@@ -89,29 +83,8 @@
 
     <dependencies>
         <dependency>
-            <groupId>xml-apis</groupId>
-            <artifactId>xml-apis</artifactId>
-            <version>1.3.04</version> 
-            <scope>provided</scope>
-        </dependency>
-        <dependency>
-            <groupId>nekohtml</groupId>
-            <artifactId>nekohtml</artifactId>
-            <version>0.9.5</version>
-        </dependency>
-        <dependency>
-            <groupId>nekohtml</groupId>
-            <artifactId>nekodtd</artifactId>
-            <version>0.1.11</version>    
-        </dependency>
-        <dependency>
-            <groupId>xerces</groupId>
-            <artifactId>xercesImpl</artifactId>
-            <version>2.8.1</version>
-        </dependency>
-        <dependency>
-            <groupId>xml-resolver</groupId>
-            <artifactId>xml-resolver</artifactId>
+            <groupId>org.ccil.cowan.tagsoup</groupId>
+            <artifactId>tagsoup</artifactId>
             <version>1.2</version>
         </dependency>
     </dependencies>

Added: 
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
URL: 
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java?rev=884067&view=auto
==============================================================================
--- sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java (added)
+++ sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java Wed Nov 25 12:28:19 2009
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sling.commons.html.impl;
+
+import java.io.IOException;
+
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMResult;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * The <code>DOMBuilder</code> is a utility class that will generate a W3C
+ * DOM Document from SAX events.
+ *
+ */
+public class DOMBuilder implements ContentHandler, LexicalHandler {
+
+    /** The default transformer factory shared by all instances */
+    private static final SAXTransformerFactory FACTORY = (SAXTransformerFactory) TransformerFactory.newInstance();
+
+    /** The result */
+    private final DOMResult result;
+
+    private final ContentHandler contentHandler;
+    private final LexicalHandler lexicalHandler;
+
+    /**
+     * Construct a new instance of this DOMBuilder.
+     */
+    public DOMBuilder() throws IOException {
+        try {
+            final TransformerHandler handler = FACTORY.newTransformerHandler();
+            this.contentHandler = handler;
+            this.lexicalHandler = handler;
+            this.result = new DOMResult();
+            handler.setResult(this.result);
+        } catch (javax.xml.transform.TransformerException local) {
+            throw (IOException) new IOException("Fatal-Error: Unable to get transformer handler").initCause(local);
+        }
+    }
+
+    /**
+     * Return the newly built Document.
+     */
+    public Document getDocument() {
+        if (this.result.getNode() == null) {
+            return null;
+        } else if (this.result.getNode().getNodeType() == Node.DOCUMENT_NODE) {
+            return (Document) this.result.getNode();
+        } else {
+            return this.result.getNode().getOwnerDocument();
+        }
+    }
+
+    public void setDocumentLocator(Locator locator) {
+        contentHandler.setDocumentLocator(locator);
+    }
+
+    public void startDocument()
+    throws SAXException {
+        contentHandler.startDocument();
+    }
+
+    public void endDocument()
+    throws SAXException {
+        contentHandler.endDocument();
+    }
+
+    public void startPrefixMapping(String prefix, String uri)
+    throws SAXException {
+        contentHandler.startPrefixMapping(prefix, uri);
+    }
+
+    public void endPrefixMapping(String prefix)
+    throws SAXException {
+        contentHandler.endPrefixMapping(prefix);
+    }
+
+    public void startElement(String uri, String loc, String raw, Attributes a)
+    throws SAXException {
+        contentHandler.startElement(uri, loc, raw, a);
+    }
+
+    public void endElement(String uri, String loc, String raw)
+    throws SAXException {
+        contentHandler.endElement(uri, loc, raw);
+    }
+
+    public void characters(char c[], int start, int len)
+    throws SAXException {
+        contentHandler.characters(c, start, len);
+    }
+
+    public void ignorableWhitespace(char c[], int start, int len)
+    throws SAXException {
+        contentHandler.ignorableWhitespace(c, start, len);
+    }
+
+    public void processingInstruction(String target, String data)
+    throws SAXException {
+        contentHandler.processingInstruction(target, data);
+    }
+
+    public void skippedEntity(String name)
+    throws SAXException {
+        contentHandler.skippedEntity(name);
+    }
+
+    public void startDTD(String name, String publicId, String systemId)
+    throws SAXException {
+        lexicalHandler.startDTD(name, publicId, systemId);
+    }
+
+    public void endDTD()
+    throws SAXException {
+        lexicalHandler.endDTD();
+    }
+
+    public void startEntity(String name)
+    throws SAXException {
+        lexicalHandler.startEntity(name);
+    }
+
+    public void endEntity(String name)
+    throws SAXException {
+        lexicalHandler.endEntity(name);
+    }
+
+    public void startCDATA()
+    throws SAXException {
+        lexicalHandler.startCDATA();
+    }
+
+    public void endCDATA()
+    throws SAXException {
+        lexicalHandler.endCDATA();
+    }
+
+    public void comment(char ch[], int start, int len)
+    throws SAXException {
+        lexicalHandler.comment(ch, start, len);
+    }
+}

Propchange: 
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
------------------------------------------------------------------------------
    svn:keywords = author date id revision rev url

Propchange: 
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
URL: 
http://svn.apache.org/viewvc/sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java?rev=884067&r1=884066&r2=884067&view=diff
==============================================================================
--- sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java (original)
+++ sling/trunk/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java Wed Nov 25 12:28:19 2009
@@ -22,9 +22,12 @@
 import java.io.InputStream;
 
 import org.apache.sling.commons.html.HtmlParser;
+import org.ccil.cowan.tagsoup.Parser;
 import org.w3c.dom.Document;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
 
 /**
  * @scr.component metatype="false"
@@ -38,13 +41,42 @@
      */
     public void parse(InputStream stream, String encoding, ContentHandler ch)
     throws SAXException {
-        NekohtmlSaxParser.parse(stream, encoding, ch);
+        final Parser parser = new Parser();
+        if ( ch instanceof LexicalHandler ) {
+            parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch);
+        }
+        parser.setContentHandler(ch);
+        final InputSource source = new InputSource(stream);
+        source.setEncoding(encoding);
+        try {
+            parser.parse(source);
+        } catch (IOException ioe) {
+            throw new SAXException(ioe);
+        }
     }
 
     /**
      * @see org.apache.sling.commons.html.HtmlParser#parse(java.lang.String, java.io.InputStream, java.lang.String)
      */
     public Document parse(String systemId, InputStream stream, String encoding) throws IOException {
-        return NekohtmlDomParser.parse(systemId, stream, encoding);
+        final Parser parser = new Parser();
+
+        final DOMBuilder builder = new DOMBuilder();
+
+        final InputSource source = new InputSource(stream);
+        source.setEncoding(encoding);
+        source.setSystemId(systemId);
+
+        try {
+            parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+            parser.setContentHandler(builder);
+            parser.parse(source);
+        } catch (SAXException se) {
+            if ( se.getCause() instanceof IOException ) {
+                throw (IOException) se.getCause();
+            }
+            throw (IOException) new IOException("Unable to parse xml.").initCause(se);
+        }
+        return builder.getDocument();
     }
 }


Reply via email to