html2xdoc Html2XdocBean.java

jstrachan Thu, 06 Mar 2003 11:15:00 -0800

jstrachan    2003/03/06 11:15:12

  Added:       src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc
                        TestHtml2Xdoc.java output1.xml input1.html
               src/plugins-build/html2xdoc/xdocs goals.xml index.xml
                        .cvsignore navigation.xml properties.xml
               src/plugins-build/html2xdoc plugin.jelly project.xml
                        .cvsignore maven.xml
               src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc
                        Html2XdocBean.java
  Log:
  Initial version of a plugin which can take HTML and turn it into the xdoc format 
ready for styling by the xdoc plugin.
  
  This allows developers to use good old HTML (and some WYSIWYG tool) to edit & 
maintain documentation and get it included in the same site documentation as all the 
xdoc stuff
  
  Revision  Changes    Path
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/TestHtml2Xdoc.java
  
  Index: TestHtml2Xdoc.java
  ===================================================================
  /*
   * 
/home/cvs/jakarta-turbine-maven/src/plugins-build/jellydoc/src/main/org/apache/maven/jellydoc/TagXMLDoclet.java,v
 1.1 2003/02/07 12:10:44 jstrachan Exp
   * 1.1
   * 2003/02/07 12:10:44
   *
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Commons", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written
   *    permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   * 
   * TagXMLDoclet.java,v 1.1 2003/02/07 12:10:44 jstrachan Exp
   */
  
  package org.apache.maven.html2xdoc;
  
  import java.io.IOException;
  import java.io.StringWriter;
  import java.net.URL;
  import java.util.Iterator;
  import java.util.LinkedList;
  import java.util.List;
  
  import org.dom4j.CharacterData;
  import org.dom4j.Document;
  import org.dom4j.DocumentFactory;
  import org.dom4j.Element;
  import org.dom4j.Node;
  import org.dom4j.io.OutputFormat;
  import org.dom4j.io.SAXReader;
  import org.dom4j.io.XMLWriter;
  
  import org.cyberneko.html.parsers.SAXParser;
  
  import junit.framework.Test;
  import junit.framework.TestCase;
  import junit.framework.TestSuite;
  import junit.textui.TestRunner;
  
  /**
   * A test harness for the HTML to XDOC converter 
   * 
   * @author <a href="mailto:[EMAIL PROTECTED]">James Strachan</a>
   */
  public class TestHtml2Xdoc extends TestCase {
      
      public static void main( String[] args ) {
          TestRunner.run( suite() );
      }
  
      public static Test suite() {
          return new TestSuite( TestHtml2Xdoc.class );
      }
  
      public TestHtml2Xdoc(String testName) {
          super(testName);
      }
  
      // Test cases
      //-------------------------------------------------------------------------
      public void testOne() throws Exception {
                assertConversion("input1.html", "output1.xml");
      }
      
      // Implementation methods
      //-------------------------------------------------------------------------
      protected void assertConversion(String input, String output) throws Exception {
          Html2XdocBean converter = createConverter();
          Document inputDoc = parseHtml(input);
          Document expectedDoc = parse(output);
          
          Document actualDoc = converter.convert(inputDoc);
          
          assertEqual("Output for: " + input + " does not match: " + output, 
expectedDoc, actualDoc);
      }
  
      /**
       * Asserts that the given two documents are equal
       * 
       * @param string
       * @param expectedDoc
       * @param actualDoc
       */
      protected void assertEqual(
          String message,
          Document expectedDoc,
          Document actualDoc) throws IOException {
  
                String expectedText = getPrettyPrintText(expectedDoc);
                String actualText = getPrettyPrintText(actualDoc);
                
                if (!expectedText.equals(actualText)) {
              System.out.println("Expected: " + expectedText);  
              System.out.println("Actual: " + actualText);      
                }
          assertEquals(message, expectedText, actualText);    
      }
  
      /**
       * @param expectedDoc
       * @return Object
       */
      protected String getPrettyPrintText(Document doc) throws IOException {
          OutputFormat format = OutputFormat.createPrettyPrint();
          StringWriter buffer = new StringWriter();
          XMLWriter writer = new XMLWriter(buffer, format);
          writer.write(doc);
          writer.close();
          return buffer.toString();
      }
  
      /**
       * Parses the given String URI on the classpath and returns the docuemnt
       * 
       * @param input
       * @return Document
       */
      protected Document parse(String input) throws Exception {
          URL url = getClassURL(input);
          SAXReader saxReader = new SAXReader();   
          return saxReader.read(url);
      }
  
      /**
       * Parses the given HTML using a String URI on the classpath 
       * and returns the docuemnt
       * 
       * @param input
       * @return Document
       */
      protected Document parseHtml(String input) throws Exception {
          URL url = getClassURL(input);
          SAXParser htmlParser = new SAXParser();
          htmlParser.setProperty(
              "http://cyberneko.org/html/properties/names/elems";,
              "lower"
          );
          htmlParser.setProperty(
              "http://cyberneko.org/html/properties/names/attrs";,
              "lower"
          );
          SAXReader saxReader = new SAXReader(htmlParser);
          return saxReader.read(url);
      }
  
        protected URL getClassURL(String input) throws Exception {
          URL url = getClass().getResource(input);
          assertTrue("Could not find resource on classpath for: " + input, url != 
null);
          return url;
        }
        
      protected Html2XdocBean createConverter() {
          return new Html2XdocBean();
      }
  }
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/output1.xml
  
  Index: output1.xml
  ===================================================================
  <document>
    <properties>
      <title>A title</title>
    </properties>
    <body>
      <section name="A H1">
        <p>Some text
        </p>
        <br/>
        <p>More text
        </p>
                        <p>Some consective pres</p>
                        <source>hi there</source>
      </section>
    </body>
  </document>
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/input1.html
  
  Index: input1.html
  ===================================================================
  <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
    <title>A title</title>
  </head>
  <body>
        
  <h1>A H1</h1>
  
  Some text
  <br/>
  More text
  
  <p>Some consective pres</p>
  <pre>hi</pre>
  <pre>there</pre>
  </body>
  </html>
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/xdocs/goals.xml
  
  Index: goals.xml
  ===================================================================
  <?xml version="1.0" encoding="ISO-8859-1"?>
  <document>
  
    <properties>
      <title>Html2XDoc Plugin Goals</title>
      <author email="[EMAIL PROTECTED]">James Strachan</author>
    </properties>
  
    <body>
      <section name="Goals">
        <table>
                <tr><th>Goal</th><th>Description</th></tr>
                
                                <a name="html2xdoc" />
                <tr>
                  <td>html2xdoc</td>
                  <td>
                    The default goal. This goal generates xdoc documents from some 
basic 
                    HTML documentation which can then be styled by the xdoc goal to use
                    a uniform CSS style and add the common navigation links etc.
                    <br/>
                    This goal means that normal vanilla HTML can be used to create 
documentation
                    using a WYSIWYG editing tool like Mozilla or MS FrontPage. 
                    <br/>
                    Most end user documentation
                    only requires basic formatting (bold, italic, bullets) along with 
headings and images and
                    links so using normal HTML for documentation with a WYSIWYG tool 
seems like a good idea.
                  </td>
                </tr>
                
        </table>
      </section>
    </body>
  </document>
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/xdocs/index.xml
  
  Index: index.xml
  ===================================================================
  <?xml version="1.0"?>
  <document>
  
    <properties>
      <title>Maven Html2XDoc Plug-in</title>
      <author email="[EMAIL PROTECTED]">James Strachan</author>
    </properties>
  
    <body>
      <section name="Maven Html2XDoc Plug-in">
        <p>
                                This plugin generates xdoc documents from some basic 
                                HTML documentation which can then be styled by the 
xdoc goal to use
                                a uniform CSS style and add the common navigation 
links etc.
        </p>
        <p>
                                This goal means that normal vanilla HTML can be used 
to create documentation
                                using a WYSIWYG editing tool like Mozilla or MS 
FrontPage. 
        </p>
        <p>
                                Most end user documentation
                                only requires basic formatting (bold, italic, bullets) 
along with headings and images and
                                links so using normal HTML for documentation with a 
WYSIWYG tool is a good idea.        
        </p>
        <p>
          The properties that allow you to customize the execution 
          are documented <a href="properties.html">here</a>.
        </p>
        <p>
                To enable HTML to xdoc conversion add the following to your maven.xml
        </p>
        <source>
    &lt;preGoal name="xdoc:jelly-transform"&gt;
      &lt;attainGoal name="html2xdoc"/&gt;
    &lt;/preGoal&gt;            
        </source>
      </section>
   </body>
  </document>
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/xdocs/.cvsignore
  
  Index: .cvsignore
  ===================================================================
  stylesheets
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/xdocs/navigation.xml
  
  Index: navigation.xml
  ===================================================================
  <?xml version="1.0" encoding="ISO-8859-1"?>
  <project name="Maven Html2XDoc Plugin">
  
    <title>Maven Html2XDoc Plugin</title>
  
    <body>
      <links>
        <item name="Maven"    href="http://jakarta.apache.org/turbine/maven/"/>
        <item name="Jelly"    href="http://jakarta.apache.org/commons/jelly/"/>
      </links>
      <menu name="Overview">
        <item name="Goals"                   href="/goals.html" />
        <item name="Properties"              href="/properties.html" />
      </menu>
    </body>
  </project>
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/xdocs/properties.xml
  
  Index: properties.xml
  ===================================================================
  <?xml version="1.0" encoding="ISO-8859-1"?>
  <document>
  
    <properties>
      <title>Html2XDoc Properties</title>
      <author email="[EMAIL PROTECTED]">James Strachan</author>
    </properties>
  
    <body>
      <section name="Html2XDoc Settings"> 
        <table>
          <tr><th>Property</th><th>Optional?</th><th>Description</th></tr>
          <tr>
            <td>maven.html2xdoc.dir</td>
            <td>Yes (default=${basedir}/xdocs)</td>
            <td>
              Specifies the directory containing HTML documents which will be
              turned into xdoc XML documents in the 
                                                <code>target/generated-xdocs</code> 
directory
              ready for use by the xdoc plugin.
            </td>
          </tr>
          <tr>
            <td>maven.html2xdoc.jsl</td>
            <td>Yes</td>
            <td>
                Allows the JSL stylesheet to be specified to perform some custom
                HTML to XDoc transformation.
            </td>
          </tr>
        </table>
      </section>
    </body>
  </document>
  
  
  
  1.1                  jakarta-turbine-maven/src/plugins-build/html2xdoc/plugin.jelly
  
  Index: plugin.jelly
  ===================================================================
  <?xml version="1.0"?>
  
  <project
    xmlns:j="jelly:core"
    xmlns:x="jelly:xml"  
    xmlns:html="jelly:html"
    xmlns:maven="jelly:maven"
    xmlns:log="jelly:log"
    xmlns:util="jelly:util"
    xmlns:doc="doc"
    xmlns:m="maven">
  
    <goal name="html2xdoc" prereqs="xdoc:init"
      description="Generates XDoc documentation from normal HTML files">
  
      <!-- output directory: maven.gen.docs, or ${maven.build.dir}/generated-xdocs when that is empty -->
      <j:set var="destdir" value="${maven.gen.docs}"/>
      <j:if test="${empty(destdir)}">
        <j:set var="destdir" value="${maven.build.dir}/generated-xdocs"/>
      </j:if>
      
      <!-- source directory: maven.html2xdoc.dir, or ${basedir}/xdocs when that is empty -->
      <j:set var="srcdir" value="${maven.html2xdoc.dir}"/>    
      <j:if test="${empty(srcdir)}">
        <j:set var="srcdir" value="${basedir}/xdocs"/>
      </j:if>
      
      <!-- encoding handed to the j:file tag that writes each generated document -->
      <j:set var="outputencoding" value="${maven.docs.outputencoding}"/>
  
      <!-- tool for converting HTML into XDoc -->
      <j:useBean class="org.apache.maven.html2xdoc.Html2XdocBean" var="htmlTool"/>
  
      <!-- mapper for determining output file name -->
      <j:new var="mapper" 
className="org.apache.maven.util.CaseInsensitiveGlobPatternMapper"/>
  
                <!-- replace both directory variables with their canonical paths -->
                <util:file var="srcdirFile" name="${srcdir}"/>
                <j:set var="srcdir" value="${srcdirFile.getCanonicalPath()}"/>
                <util:file var="destdirFile" name="${destdir}"/>
                <j:set var="destdir" value="${destdirFile.getCanonicalPath()}"/>
                 
  
        <!-- file patterns (backslashes turned into forward slashes): srcdir/*.html maps to destdir/*.xml -->
        <util:replace var="fromPattern" oldChar="\" newChar="/" 
value="${srcdir}/*.html"/>
        <util:replace var="toPattern" oldChar="\" newChar="/" 
value="${destdir}/*.xml"/>
        <j:setProperties object="${mapper}" from="${fromPattern}" to="${toPattern}"/>
  
        <!-- directory patterns: srcdir* maps to destdir*; presumably this relocates any source sub-directory under destdir (confirm against the mapper class) -->
        <util:replace var="fromDirPattern" oldChar="\" newChar="/" value="${srcdir}*"/>
        <util:replace var="toDirPattern" oldChar="\" newChar="/" value="${destdir}*"/>
        
        <j:new var="dirMapper" 
className="org.apache.maven.util.CaseInsensitiveGlobPatternMapper"/>
        <j:setProperties object="${dirMapper}" from="${fromDirPattern}" 
to="${toDirPattern}"/>
  
        <!-- collect the .html files below srcdir using the include pattern **/*.html -->
        <fileScanner var="docFiles">
          <fileset dir="${srcdir}">
            <patternset>
              <include name="**/*.html"/>
            </patternset>
          </fileset>
        </fileScanner>
  
        <!-- convert each HTML file that was found -->
        <j:forEach var="file" items="${docFiles.iterator()}">
  
                                <!-- map the parent directory of the file through dirMapper and create the resulting directory -->
                                <util:replace var="inDirForward" oldChar="\" 
newChar="/" value="${file.parent}"/>
          <j:set var="outDir" value="${dirMapper.mapFileName(inDirForward).0}"/>
          <mkdir dir="${outDir}"/>
  
  
          <!-- generate output file name -->
          <util:replace var="outFileForward" oldChar="\" newChar="/" 
value="${file.toString()}"/>
          <j:set var="outFiles" value="${mapper.mapFileName(outFileForward)}"/>
          <j:set var="outFile" value="${outFiles[0]}"/>
  
          <echo>Generating ${outFile} from ${file}</echo>
          
          <!-- parse the HTML doc and pass it to the stylesheet -->
          <html:parse var="htmlDoc" html="${file}" element="lower" attribute="lower"/>
          
          <!-- now lets convert the document into XML -->
          <j:set var="xmlDoc" value="${htmlTool.convert(htmlDoc)}"/>
  
          <!-- write the converted document to outFile: pretty printed, XML output mode, no XML declaration -->
          <j:file name="${outFile}" encoding="${outputencoding}"
            omitXmlDeclaration="true" outputMode="xml"
            prettyPrint="true">
  
                                        <x:copyOf select="$xmlDoc"/>
          </j:file>
        </j:forEach>
    </goal>
  
  </project>
  
  
  
  1.1                  jakarta-turbine-maven/src/plugins-build/html2xdoc/project.xml
  
  Index: project.xml
  ===================================================================
  <?xml version="1.0" encoding="UTF-8"?>
  
  <project>
    <extend>${basedir}/../project.xml</extend>
    <pomVersion>3</pomVersion>
    <id>maven-html2xdoc-plugin</id>
    <name>Maven Html2XDoc Plug-in</name>
    <currentVersion>1.0</currentVersion>
    <!-- Gump integration -->
  
    <gumpRepositoryId>jakarta</gumpRepositoryId>
    <description>Creates XDoc documentation from normal HTML 
documentation.</description>
    <shortDescription/>
    <url>http://jakarta.apache.org/turbine/maven/reference/plugins/html2xdoc/</url>
  
    
<siteDirectory>/www/jakarta.apache.org/turbine/maven/reference/plugins/html2xdoc/</siteDirectory>
  
    <repository>
      <connection>scm:cvs:pserver:[EMAIL 
PROTECTED]:/home/cvspublic:jakarta-turbine-maven/src/plugins-build/html2xdoc/</connection>
      
<url>http://cvs.apache.org/viewcvs/jakarta-turbine-maven/src/plugins-build/html2xdoc/</url>
    </repository>
  
    <developers>
      <developer>
        <name>James Strachan</name>
        <id>jstrachan</id>
        <email>[EMAIL PROTECTED]</email>
        <organization>SpiritSoft, Inc.</organization>
        <roles>
          <role>Java Developer</role>
        </roles>
      </developer>
    </developers>
    
    <dependencies>
      <dependency>
        <groupId>commons-jelly</groupId>
        <artifactId>commons-jelly-tags-xml</artifactId>
        <version>20030211.142705</version>
        <url>http://jakarta.apache.org/commons/jelly/tags/xml/</url>
        <properties>
          <classloader>root.maven</classloader>
        </properties>
      </dependency>
      <dependency>
        <groupId>commons-jelly</groupId>
        <artifactId>commons-jelly-tags-html</artifactId>
        <version>SNAPSHOT</version>
        <url>http://jakarta.apache.org/commons/jelly/tags/html/</url>
        <properties>
          <classloader>root.maven</classloader>
        </properties>
      </dependency>
      <dependency>
        <id>nekohtml</id>
        <version>0.7.1</version>
        <properties>
          <classloader>root.maven</classloader>
        </properties>
      </dependency>
      
      <!-- core dependencies which i'd have thought Maven introduced -->
      <dependency>
        <id>commons-logging</id>
        <version>1.0.1</version>
        <properties>
          <classloader>root.maven</classloader>
        </properties>
      </dependency>
      <dependency>
        <id>dom4j</id>
        <version>1.4-dev-8</version>
        <url>http://www.dom4j.org/</url>
      </dependency>
      <dependency>
        <id>xml-apis</id>
        <version>1.0.b2</version>
        <url>http://xml.apache.org/xerces2-j/</url>
      </dependency>
      <dependency>
        <id>xerces</id>
        <version>2.2.1</version>
        <url>http://xml.apache.org/xerces2-j/</url>
      </dependency>
    </dependencies>
  
    <build>
      <sourceDirectory>src/main</sourceDirectory>
      <unitTestSourceDirectory>src/test</unitTestSourceDirectory>
      <unitTest>
        <includes>
          <include>**/Test*.java</include>
        </includes>
                        <resources>
                                <resource>
                                        <directory>src/test</directory>
                                        <includes>
                                                <include>*.xml</include>
                                                <include>*.html</include>
                                        </includes>
                                </resource>
                        </resources>
      </unitTest>
    </build>
  
  </project>
  
  
  1.1                  jakarta-turbine-maven/src/plugins-build/html2xdoc/.cvsignore
  
  Index: .cvsignore
  ===================================================================
  target
  maven.log
  
  
  
  1.1                  jakarta-turbine-maven/src/plugins-build/html2xdoc/maven.xml
  
  Index: maven.xml
  ===================================================================
  <project default="plugin:install">
  
    <!-- copies unit testing stuff: don't know why we need this! -->
    <postGoal name="test:compile">
      <copy todir="target/test-classes">
        <fileset dir="src/test">
          <include name="**/*.html"/>
          <include name="**/*.xml"/>
        </fileset>
      </copy>
    </postGoal>
  </project>
  
  
  
  1.1                  
jakarta-turbine-maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc/Html2XdocBean.java
  
  Index: Html2XdocBean.java
  ===================================================================
  /*
   * 
/home/cvs/jakarta-turbine-maven/src/plugins-build/jellydoc/src/main/org/apache/maven/jellydoc/TagXMLDoclet.java,v
 1.1 2003/02/07 12:10:44 jstrachan Exp
   * 1.1
   * 2003/02/07 12:10:44
   *
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Commons", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written
   *    permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   * 
   * TagXMLDoclet.java,v 1.1 2003/02/07 12:10:44 jstrachan Exp
   */
  
  package org.apache.maven.html2xdoc;
  
  import java.util.ArrayList;
  import java.util.Iterator;
  import java.util.LinkedList;
  import java.util.List;
  
  import org.apache.commons.logging.Log;
  import org.apache.commons.logging.LogFactory;
  import org.dom4j.CharacterData;
  import org.dom4j.Document;
  import org.dom4j.DocumentFactory;
  import org.dom4j.Element;
  import org.dom4j.Node;
  
  /**
   * A simple bean for converting a HTML document into an XDoc compliant XML document.
   * This could be done via XSLT but is a little more complex than it might first
   * appear so its done via Java code instead.  
   * 
   * @author <a href="mailto:[EMAIL PROTECTED]">James Strachan</a>
   */
  public class Html2XdocBean {
      
      /** The Log to which logging calls will be made. */
      private static final Log log = LogFactory.getLog(Html2XdocBean.class);
      
      private DocumentFactory factory = new DocumentFactory(); 
  
        /**
         * Converts the given HTML document into the corresponding XDoc format
         * of XML
         * 
         * @param html
         * @return Document
         */
        public Document convert(Document html) {
            Document doc = factory.createDocument();
            Element root = doc.addElement("document");
            Element properties = root.addElement("properties");
            Element title = properties.addElement("title");
            title.setText(html.valueOf("/html/head/title"));
            
            Element body = root.addElement("body");
            
            Element htmlContent = (Element) html.selectSingleNode("/html/body");
          if (htmlContent == null) {
              log.info("No body element found for HTML document: " + html.asXML());
          }
          else {
              addSections(body, htmlContent);
          }
            return doc;     
        }
        
        /**
         * Iterates thorugh the given body looking for h1, h2, h3 nodes and
         * creating the associated section elements. Any text nodes 
         * contained inside the body are wrapped in a &lt;p&gt; element
         * 
         * @param output the output destination
         * @param body the block of HTML markup to convert
         */
        protected void addSections(Element output, Element body) {
            List content = getBodyContent(body.content());
            Element section = null;         
          Element p = null;
          
            for (Iterator iter = content.iterator(); iter.hasNext(); ) {
                Node node = (Node) iter.next();
  
              String name = node.getName();
              if (name != null && name.startsWith("h")) {
                  /** @todo we should handle child headings as a nested section */
                  section = output.addElement("section");
                  section.addAttribute("name", node.getText());                
                  p = null; 
                        }
                        else {
                            if (section == null ) {
                                // we have a section with no name
                                // should we default it to be the same as the document 
title?
                                section = output.addElement("section");
                            }
                            
                  if (node instanceof CharacterData) {
                      // lets add a <p>
                      if (p == null) { 
                        p = section.addElement("p");
                      }
                      p.addText( node.getText() );
                  }
                  else {
                      section.add(cloneNode(node));
                      p = null; 
                  }
              }
            }
        }
        
        /**
         * @param node
         * @return true if the given node is a heading element (h1, h2, h3 etc)
         */
        protected boolean isHeading(Node node) {
                String name = node.getName();
                return name != null && name.startsWith("h");
        }
        
        /**
         * Returns a copy of the body content, removing any whitespace from the 
beginning and end 
         * @param body
         * @return List
         */
      protected List getBodyContent(List content) {
          // lets turn <pre> into <source> and concatenate consective entries 
          Element lastPre = null;
          LinkedList  list = new LinkedList();
          boolean lastWasElement = true;
          for (Iterator iter = content.iterator(); iter.hasNext(); ) {
              Node node = (Node) iter.next();
              
              if (isPre(node)) {
                  if (lastPre == null) {
                      lastPre = factory.createElement("source");
                      list.add(lastPre);
                  }
                  lastPre.addText(node.getText());
              }
              else {
                  if (isWhitespace(node) && lastWasElement) {
                      if (lastPre != null) {
                          lastPre.addText(node.getText());
                      }
                  }
                  else {
                      lastWasElement = node instanceof Element;
                      if (lastWasElement) {
                        lastPre = null;
                      }                
                        list.add(node);
                  }
              }
          }        
  
          // now lets remove any whitespace text nodes at the beginning and end
          while (true) {
              Node node = (Node) list.getFirst();
              if (isWhitespace(node)) {
                  list.removeFirst();
                  continue;
              }
              break;
          }
          while (true) {
              Node node = (Node) list.getLast();
              if (isWhitespace(node)) {
                  list.removeLast();
                  continue;
              }
              break;
          }
          return list;
      }
      
      protected boolean isPre(Node node) {
          if (node instanceof Element) {
              Element element = (Element) node;
              return element.getName().equals("pre");
          }
          return false;
      }
  
      /**
       * @return true if the given node is a whitespace text node 
       */
      protected boolean isWhitespace(Node node) {
          if (node instanceof CharacterData) {
              String text = node.getText();
              return text.trim().length() <= 0;
          }
          if (node instanceof Element) {
              String name = node.getName(); 
                        if (name.equals("p")) {
                  String text = node.getText();
                  return text.trim().length() <= 0;
                        }
                        if (name.equals("br")) {
                            return true;
                        }
          }
          return false;
      }
  
        /**
         * Normalizes the whitespace of any Elements
         * @param node
         * @return Node
         */    
      protected Node cloneNode(Node node) {
          Node answer = (Node) node.clone(); 
          if (answer instanceof Element) {
              Element element = (Element) answer;
              element.normalize();
          }
          return answer;
      }
          
  }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-turbine-maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc Html2XdocBean.java

Reply via email to