upayavira 2004/07/08 03:11:41
Modified: . gump.xml status.xml lib jars.xml Added: legal nekohtml.0.9.2.jar.license.txt src/blocks/html/conf html.xmap src/blocks/html/java/org/apache/cocoon/generation NekoHTMLGenerator.java src/blocks/html/lib nekohtml-0.9.2.jar Removed: src/blocks/html/conf tidy.xmap Log: Adding a NekoHTMLGenerator. I wondered whether to extend the existing HTMLGenerator to use Neko or JTidy, but decided on a new Generator. They could be merged if necessary. This is the first Jar I've committed. Can someone check I've done gump, licence, etc, correctly? Revision Changes Path 1.169 +7 -2 cocoon-2.1/gump.xml Index: gump.xml =================================================================== RCS file: /home/cvs/cocoon-2.1/gump.xml,v retrieving revision 1.168 retrieving revision 1.169 diff -u -r1.168 -r1.169 --- gump.xml 6 Jul 2004 15:31:38 -0000 1.168 +++ gump.xml 8 Jul 2004 10:11:38 -0000 1.169 @@ -470,7 +470,8 @@ <depend project="cocoon" inherit="all"/> <depend project="jtidy"/> - + <depend project="nekohtml"/> + <work nested="tools/anttasks"/> <home nested="build/cocoon-@@DATE@@"/> @@ -1271,4 +1272,8 @@ <jar name="lib/core/javacImpl-0.9.jar" id="impl"/> </project> + <project name="nekohtml"> + <package>org.cyberneko.html</package> + <jar name="src/blocks/html/lib/nekohtml-0.9.2.jar" id="nekohtml"/> + </project> </module> 1.389 +6 -1 cocoon-2.1/status.xml Index: status.xml =================================================================== RCS file: /home/cvs/cocoon-2.1/status.xml,v retrieving revision 1.388 retrieving revision 1.389 diff -u -r1.388 -r1.389 --- status.xml 8 Jul 2004 07:22:34 -0000 1.388 +++ status.xml 8 Jul 2004 10:11:38 -0000 1.389 @@ -204,6 +204,11 @@ <changes> <release version="@version@" date="@date@"> + <action dev="UV" type="add"> + Added a NekoHTMLGenerator to HTML block. This is a simpler HTML parser than + JTidy, which preserves more of the original HTML, primarily just balancing + closing tags. 
+ </action> <action dev="TC" type="add" fixes-bug="29935" due-to="Leszek Gawron" due-to-email="[EMAIL PROTECTED]"> added support for stripping root elements in the CIncludeTransformer <action> 1.238 +9 -1 cocoon-2.1/lib/jars.xml Index: jars.xml =================================================================== RCS file: /home/cvs/cocoon-2.1/lib/jars.xml,v retrieving revision 1.237 retrieving revision 1.238 diff -u -r1.237 -r1.238 --- jars.xml 8 Jul 2004 09:17:31 -0000 1.237 +++ jars.xml 8 Jul 2004 10:11:40 -0000 1.238 @@ -571,6 +571,14 @@ </file> <file> + <title>Transform HTML to XML</title> + <description>NekoHTML is a lightweight HTML syntax correcter written using Xerces Native Interface.</description> + <used-by>NekoHTML generator (html block)</used-by> + <lib>html/lib/nekohtml-0.9.2.jar</lib> + <homepage>http://www.apache.org/~andyc/neko/</homepage> + </file> + + <file> <title>Search engine</title> <description> jakarta-lucene is a search engine toolkit designed for indexing and 1.1 cocoon-2.1/legal/nekohtml.0.9.2.jar.license.txt Index: nekohtml.0.9.2.jar.license.txt =================================================================== The CyberNeko Software License, Version 1.0 (C) Copyright 2002,2003, Andy Clark. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The end-user documentation included with the redistribution, if any, must include the following acknowledgment: "This product includes software developed by Andy Clark." 
Alternately, this acknowledgment may appear in the software itself, if and wherever such third-party acknowledgments normally appear. 4. The names "CyberNeko" and "NekoHTML" must not be used to endorse or promote products derived from this software without prior written permission. For written permission, please contact [EMAIL PROTECTED] 5. Products derived from this software may not be called "NekoHTML", nor may "NekoHTML" appear in their name, without prior written permission of the author. THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR OTHER CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ==================================================================== This license is based on the Apache Software License, version 1.1. 1.1 cocoon-2.1/src/blocks/html/conf/html.xmap Index: html.xmap =================================================================== <?xml version="1.0"?> <!-- Copyright 1999-2004 The Apache Software Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. --> <xmap xpath="/sitemap/components/generators" unless="[EMAIL PROTECTED]'html']"> <map:generator name="html" src="org.apache.cocoon.generation.HTMLGenerator" label="content"/> <map:generator name="nekohtml" src="org.apache.cocoon.generation.NekoHTMLGenerator" label="content"> <!-- NekoHTML configuration file. <neko-config>context://WEB-INF/neko.properties</neko-config> --> </map:generator> </xmap> 1.1 cocoon-2.1/src/blocks/html/java/org/apache/cocoon/generation/NekoHTMLGenerator.java Index: NekoHTMLGenerator.java =================================================================== /* * Copyright 1999-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package org.apache.cocoon.generation;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

import javax.servlet.http.HttpServletRequest;

import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.ResourceNotFoundException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.source.SourceUtil;
import org.apache.cocoon.environment.ObjectModelHelper;
import org.apache.cocoon.environment.Request;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.environment.http.HttpEnvironment;
import org.apache.cocoon.util.PostInputStream;
import org.apache.cocoon.xml.dom.DOMBuilder;
import org.apache.cocoon.xml.dom.DOMStreamer;
import org.apache.excalibur.source.Source;
import org.apache.excalibur.source.SourceException;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.xml.xpath.XPathProcessor;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * @cocoon.sitemap.component.documentation
 * The neko html generator reads HTML from a source, converts it to XHTML
 * and generates SAX Events. It uses the NekoHTML library to do this.
 *
 * @cocoon.sitemap.component.name   nekohtml
 * @cocoon.sitemap.component.label  content
 * @cocoon.sitemap.component.logger sitemap.generator.nekohtml
 * @cocoon.sitemap.component.documentation.caching
 *     Uses the last modification date of the xml document for validation
 *
 * @cocoon.sitemap.component.pooling.min   4
 * @cocoon.sitemap.component.pooling.max  32
 * @cocoon.sitemap.component.pooling.grow  4
 *
 * @author <a href="mailto:[EMAIL PROTECTED]">Davanum Srinivas</a>
 * @author <a href="mailto:[EMAIL PROTECTED]">Carsten Ziegeler</a>
 * @author <a href="mailto:[EMAIL PROTECTED]">Nicola Ken Barozzi</a>
 * @author <a href="mailto:[EMAIL PROTECTED]">Gianugo Rabellino</a>
 *
 * @version CVS $Id: NekoHTMLGenerator.java,v 1.1 2004/07/08 10:11:41 upayavira Exp $
 */
public class NekoHTMLGenerator extends ServiceableGenerator
implements Configurable, CacheableProcessingComponent, Disposable {

    /** The parameter that specifies what request attribute to use, if any */
    public static final String FORM_NAME = "form-name";

    /** The source, if coming from a file */
    private Source inputSource;

    /** The stream the HTML is read from (request body, form field or opened source) */
    private InputStream requestStream;

    /** XPATH expression */
    private String xpath = null;

    /** XPath Processor */
    private XPathProcessor processor = null;

    /** NekoHTML configuration properties, or <code>null</code> if none were configured */
    private Properties properties;

    /**
     * Stores the service manager and looks up the XPath processor.
     */
    public void service(ServiceManager manager)
    throws ServiceException {
        super.service(manager);
        this.processor = (XPathProcessor) this.manager.lookup(XPathProcessor.ROLE);
    }

    /**
     * Reads the optional <code>neko-config</code> child element. When present,
     * its value is resolved as a URI and loaded as a properties file whose
     * entries are later applied to the NekoHTML parser configuration.
     *
     * @throws ConfigurationException if the properties cannot be loaded
     */
    public void configure(Configuration config)
    throws ConfigurationException {

        String configUrl = config.getChild("neko-config").getValue(null);
        if (configUrl == null) {
            return;
        }

        org.apache.excalibur.source.SourceResolver resolver = null;
        Source configSource = null;
        InputStream configStream = null;
        try {
            resolver = (org.apache.excalibur.source.SourceResolver)
                    this.manager.lookup(org.apache.excalibur.source.SourceResolver.ROLE);
            configSource = resolver.resolveURI(configUrl);
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Loading configuration from " + configSource.getURI());
            }

            configStream = configSource.getInputStream();
            this.properties = new Properties();
            this.properties.load(configStream);

        } catch (Exception e) {
            getLogger().warn("Cannot load configuration from " + configUrl);
            throw new ConfigurationException("Cannot load configuration from " + configUrl, e);
        } finally {
            if (configStream != null) {
                try {
                    configStream.close();
                } catch (IOException ignored) {
                    // the properties are already loaded (or loading already failed);
                    // a failing close must not hide that outcome
                }
            }
            if (resolver != null) {
                // The source has to be handed back while the resolver is still
                // ours; only afterwards may the resolver itself be released.
                if (configSource != null) {
                    resolver.release(configSource);
                }
                this.manager.release(resolver);
            }
        }
    }

    /**
     * Recycle this component.
     * All per-request instance variables are set to <code>null</code>.
     */
    public void recycle() {
        if (this.inputSource != null) {
            this.resolver.release(this.inputSource);
            this.inputSource = null;
        }
        // Always drop the stream, also when it came from the request and no
        // source was resolved; otherwise a pooled instance keeps a stale stream.
        this.requestStream = null;
        this.xpath = null;
        super.recycle();
    }

    /**
     * Setup the html generator.
     * Resolves the source, or, when no <code>src</code> is given, prepares
     * the request body / form parameter as input.
     *
     * @throws ProcessingException if required parameters are missing or the
     *         source cannot be resolved
     * @throws IOException if the request has no usable content
     */
    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par)
    throws ProcessingException, SAXException, IOException {
        super.setup(resolver, objectModel, src, par);

        Request request = ObjectModelHelper.getRequest(objectModel);

        if (src == null) {
            // Handle this request as the StreamGenerator does (from the POST
            // request or from a request parameter), but try to make sure
            // that the output will be well-formed
            String contentType = request.getContentType();

            if (contentType == null) {
                throw new IOException("Content-type was not specified for this request");
            } else if (contentType.startsWith("application/x-www-form-urlencoded") ||
                    contentType.startsWith("multipart/form-data")) {
                String requested = parameters.getParameter(FORM_NAME, null);
                if (requested == null) {
                    throw new ProcessingException(
                        "NekoHtmlGenerator with no \"src\" parameter expects a sitemap parameter called '" +
                        FORM_NAME + "' for handling form data"
                    );
                }

                String sXml = request.getParameter(requested);
                if (sXml == null) {
                    // fail with a meaningful message instead of a NullPointerException
                    throw new ProcessingException(
                        "NekoHtmlGenerator: the request does not contain the parameter '" +
                        requested + "' named by the sitemap parameter '" + FORM_NAME + "'"
                    );
                }

                requestStream = new ByteArrayInputStream(sXml.getBytes());

            } else if (contentType.startsWith("text/plain") ||
                    contentType.startsWith("text/xml") ||
                    contentType.startsWith("application/xml")) {

                HttpServletRequest httpRequest =
                        (HttpServletRequest) objectModel.get(HttpEnvironment.HTTP_REQUEST_OBJECT);
                if (httpRequest == null) {
                    throw new ProcessingException("This functionality only works in an http environment.");
                }
                int len = request.getContentLength();
                if (len > 0) {
                    requestStream = new PostInputStream(httpRequest.getInputStream(), len);
                } else {
                    throw new IOException("getContentLen() == 0");
                }
            } else {
                throw new IOException("Unexpected getContentType(): " + request.getContentType());
            }
        }

        // a request parameter takes precedence over the sitemap parameter
        xpath = request.getParameter("xpath");
        if (xpath == null) {
            xpath = par.getParameter("xpath", null);
        }

        // append the request parameter to the URL if necessary
        // (there is nothing to append to when the input comes from the request)
        if (super.source != null
                && par.getParameterAsBoolean("copy-parameters", false)
                && request.getQueryString() != null) {
            StringBuffer query = new StringBuffer(super.source);
            query.append(super.source.indexOf("?") == -1 ? '?' : '&');
            query.append(request.getQueryString());
            super.source = query.toString();
        }

        try {
            if (source != null) {
                this.inputSource = resolver.resolveURI(super.source);
            }
        } catch (SourceException se) {
            throw SourceUtil.handle("Unable to resolve " + super.source, se);
        }
    }

    /**
     * Generate the unique key.
     * This key must be unique inside the space of this component.
     * This method must be invoked before the getValidity() method.
     *
     * @return The generated key (source URI, plus the xpath expression if one
     *         is set) or <code>null</code> if the component is currently not
     *         cacheable because the input does not come from a source.
     */
    public java.io.Serializable getKey() {
        if (this.inputSource == null) {
            return null;
        }

        if (this.xpath != null) {
            StringBuffer buffer = new StringBuffer(this.inputSource.getURI());
            buffer.append(':').append(this.xpath);
            return buffer.toString();
        }
        return this.inputSource.getURI();
    }

    /**
     * Generate the validity object.
     * Before this method can be invoked the getKey() method
     * must be invoked.
     *
     * @return The generated validity object or <code>null</code> if the
     *         component is currently not cacheable.
     */
    public SourceValidity getValidity() {
        if (this.inputSource == null) {
            return null;
        }
        return this.inputSource.getValidity();
    }

    /**
     * Generate XML data: parse the HTML with NekoHTML and either stream the
     * SAX events directly, or, when an xpath expression is set, build a DOM
     * and stream only the selected nodes.
     *
     * @throws ResourceNotFoundException if reading the resolved source fails
     * @throws IOException if reading the request stream fails
     */
    public void generate()
    throws IOException, SAXException, ProcessingException {
        try {
            HtmlSaxParser parser = new HtmlSaxParser(this.properties);

            if (this.inputSource != null) {
                this.requestStream = this.inputSource.getInputStream();
            }

            if (this.xpath != null) {
                DOMBuilder builder = new DOMBuilder();
                parser.setContentHandler(builder);
                parser.parse(new InputSource(this.requestStream));
                Document doc = builder.getDocument();

                DOMStreamer domStreamer = new DOMStreamer(this.contentHandler,
                                                          this.lexicalHandler);
                this.contentHandler.startDocument();
                NodeList nl = this.processor.selectNodeList(doc, this.xpath);
                int length = nl.getLength();
                for (int i = 0; i < length; i++) {
                    domStreamer.stream(nl.item(i));
                }
                this.contentHandler.endDocument();
            } else {
                parser.setContentHandler(this.contentHandler);
                parser.parse(new InputSource(this.requestStream));
            }
        } catch (IOException e) {
            if (this.inputSource == null) {
                // the input came from the request: there is no URI to report
                throw e;
            }
            throw new ResourceNotFoundException("Could not get resource "
                + this.inputSource.getURI(), e);
        } catch (SAXException e) {
            throw e;
        } catch (Exception e) {
            throw new ProcessingException("Exception in NekoHTMLGenerator.generate()", e);
        } finally {
            // close on every path so a failed parse does not leak the stream
            closeRequestStream();
        }
    }

    /**
     * Closes and forgets the current input stream, if any. A failure to
     * close is only logged so that it cannot mask a parsing error.
     */
    private void closeRequestStream() {
        if (this.requestStream != null) {
            try {
                this.requestStream.close();
            } catch (IOException e) {
                getLogger().debug("Could not close the HTML input stream", e);
            } finally {
                this.requestStream = null;
            }
        }
    }

    /**
     * Releases the XPath processor and drops the service manager.
     */
    public void dispose() {
        if (this.manager != null) {
            this.manager.release(this.processor);
            this.manager = null;
        }
        this.processor = null;
        super.dispose();
    }

    /**
     * SAX parser backed by a NekoHTML configuration. Element names are
     * lower-cased by default; every entry of the supplied properties is
     * applied as a (string valued) NekoHTML property.
     */
    public static class HtmlSaxParser extends AbstractSAXParser {

        public HtmlSaxParser(Properties properties) {
            super(getConfig(properties));
        }

        private static HTMLConfiguration getConfig(Properties properties) {
            HTMLConfiguration config = new HTMLConfiguration();
            config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            if (properties != null) {
                for (Iterator i = properties.keySet().iterator(); i.hasNext();) {
                    String name = (String) i.next();
                    config.setProperty(name, properties.getProperty(name));
                }
            }
            return config;
        }
    }
}

1.1 cocoon-2.1/src/blocks/html/lib/nekohtml-0.9.2.jar
<<Binary file>>