html2xdoc Html2XdocBean.java

dion Mon, 04 Aug 2003 07:10:49 -0700

dion        2003/08/04 07:12:58

  Modified:    src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc
                        TestHtml2Xdoc.java
               src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc
                        Html2XdocBean.java
  Added:       src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc
                        h3h4.html h1h2.html comment.html comment.xml
                        h1h2.xml link.xml link.html
  Log:
  Tests pass and it appears to work.
  
  Applied patch for MAVEN-550
  
  Revision  Changes    Path
  1.2       +4 -0      
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/TestHtml2Xdoc.java
  
  Index: TestHtml2Xdoc.java
  ===================================================================
  RCS file: 
/home/cvs/maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/TestHtml2Xdoc.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TestHtml2Xdoc.java        6 Mar 2003 19:15:10 -0000       1.1
  +++ TestHtml2Xdoc.java        4 Aug 2003 14:12:58 -0000       1.2
  @@ -108,6 +108,10 @@
       //-------------------------------------------------------------------------
       public void testOne() throws Exception {
                assertConversion("input1.html", "output1.xml");
  +             assertConversion("h1h2.html", "h1h2.xml");
  +             assertConversion("h3h4.html", "h1h2.xml");
  +             assertConversion("link.html", "link.xml");
  +             assertConversion("comment.html", "comment.xml");
       }
       
       // Implementation methods
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/h3h4.html
  
  Index: h3h4.html
  ===================================================================
  <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
    <title>A title</title>
  </head>
  <body>
        
  <h3>A section title</h3>
  
  Some text
  <br/>
  More text
  
  <h4>a subsection</h4>
  
  This is a subsection. It only has this.
  
  </body>
  </html>
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/h1h2.html
  
  Index: h1h2.html
  ===================================================================
  <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
    <title>A title</title>
  </head>
  <body>
        
  <h1>A section title</h1>
  
  Some text
  <br/>
  More text
  
  <h2>a subsection</h2>
  
  This is a subsection. It only has this.
  
  </body>
  </html>
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/comment.html
  
  Index: comment.html
  ===================================================================
  <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
    <title>A title</title>
  </head>
  <body>
        
  <h1>A section title</h1>
  <!-- #aComment entry -->
  Some text with a <a href="somewhere">link</a>.
  </body>
  </html>
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/comment.xml
  
  Index: comment.xml
  ===================================================================
  <document>
    <properties>
      <title>A title</title>
    </properties>
    <body>
      <section name="A section title">
        <!-- #aComment entry -->
  
        <p>Some text with a <a href="somewhere">link</a>.</p>
      </section>
    </body>
  </document>
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/h1h2.xml
  
  Index: h1h2.xml
  ===================================================================
  <document>
    <properties>
      <title>A title</title>
    </properties>
    <body>
      <section name="A section title">
                        <p>Some text
                        </p>
                        <br/>
                        <p>More text
                        </p>
                        <subsection name="a subsection">
                                <p>This is a subsection. It only has this.
                                </p>
                        </subsection>
      </section>
    </body>
  </document>
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/link.xml
  
  Index: link.xml
  ===================================================================
  <document>
    <properties>
      <title>A title</title>
    </properties>
    <body>
      <section name="A section title">
        <p>Some text with a <a href="somewhere">link</a>.</p>
      </section>
    </body>
  </document>
  
  
  
  1.1                  
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/link.html
  
  Index: link.html
  ===================================================================
  <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
    <title>A title</title>
  </head>
  <body>
        
  <h1>A section title</h1>
  
  Some text with a <a href="somewhere">link</a>.
  </body>
  </html>
  
  
  
  1.2       +264 -110  
maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc/Html2XdocBean.java
  
  Index: Html2XdocBean.java
  ===================================================================
  RCS file: 
/home/cvs/maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc/Html2XdocBean.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- Html2XdocBean.java        6 Mar 2003 19:15:11 -0000       1.1
  +++ Html2XdocBean.java        4 Aug 2003 14:12:58 -0000       1.2
  @@ -62,7 +62,6 @@
   
   package org.apache.maven.html2xdoc;
   
  -import java.util.ArrayList;
   import java.util.Iterator;
   import java.util.LinkedList;
   import java.util.List;
  @@ -70,140 +69,289 @@
   import org.apache.commons.logging.Log;
   import org.apache.commons.logging.LogFactory;
   import org.dom4j.CharacterData;
  +import org.dom4j.Comment;
   import org.dom4j.Document;
   import org.dom4j.DocumentFactory;
   import org.dom4j.Element;
   import org.dom4j.Node;
   
   /**
  - * A simple bean for converting a HTML document into an XDoc compliant XML document.
  + * A simple bean for converting a HTML document into an XDoc compliant XML
  + * document.
    * This could be done via XSLT but is a little more complex than it might first
    * appear so its done via Java code instead.  
    * 
    * @author <a href="mailto:[EMAIL PROTECTED]">James Strachan</a>
    */
   public class Html2XdocBean {
  -    
  +
       /** The Log to which logging calls will be made. */
       private static final Log log = LogFactory.getLog(Html2XdocBean.class);
  -    
  -    private DocumentFactory factory = new DocumentFactory(); 
   
  -     /**
  -      * Converts the given HTML document into the corresponding XDoc format
  -      * of XML
  -      * 
  -      * @param html
  -      * @return Document
  -      */
  -     public Document convert(Document html) {
  -         Document doc = factory.createDocument();
  -         Element root = doc.addElement("document");
  -         Element properties = root.addElement("properties");
  -         Element title = properties.addElement("title");
  -         title.setText(html.valueOf("/html/head/title"));
  -         
  -         Element body = root.addElement("body");
  -         
  -         Element htmlContent = (Element) html.selectSingleNode("/html/body");
  +    /**
  +     * Used to create the output document
  +     */
  +    private DocumentFactory factory = new DocumentFactory();
  +
  +    /**
  +     * The current node to attach the sub-nodes.
  +     */
  +    private Element currentNode = null;
  +
  +    /**
  +     * The current 'root' section node. This is used to keep
  +     * track of the root section so that when a subsection is
  +     * found it can be associated correctly.
  +     */
  +    private Element currentSectionNode = null;
  +
  +    /**
  +     * The current section heading level. If a subsequent heading's level
  +     * is lower or equal, a new section is created.
  +     */
  +    private int currentSectionHeadingLevel = Integer.MIN_VALUE;
  +
  +    /**
  +     * The current paragraph node. This is used to associate text
  +     * and formatting nodes to the current paragraph node.
  +     */
  +    private Element currentParaNode = null;
  +
  +    /**
  +     * Converts the given HTML document into the corresponding XDoc format
  +     * of XML
  +     * 
  +     * @param html the input html document
  +     * @return Document
  +     */
  +    public Document convert(Document html) {
  +        Document doc = factory.createDocument();
  +        Element root = doc.addElement("document");
  +        Element properties = root.addElement("properties");
  +        Element title = properties.addElement("title");
  +        title.setText(html.valueOf("/html/head/title"));
  +
  +        Element body = root.addElement("body");
  +
  +        Element htmlContent = (Element) html.selectSingleNode("/html/body");
           if (htmlContent == null) {
  -            log.info("No body element found for HTML document: " + html.asXML());
  -        }
  -        else {
  +            log.info("No body element found for HTML document: "
  +                + html.asXML());
  +        } else {
               addSections(body, htmlContent);
           }
  -         return doc;     
  -     }
  -     
  -     /**
  -      * Iterates thorugh the given body looking for h1, h2, h3 nodes and
  -      * creating the associated section elements. Any text nodes 
  -      * contained inside the body are wrapped in a &lt;p&gt; element
  -      * 
  -      * @param output the output destination
  -      * @param body the block of HTML markup to convert
  -      */
  -     protected void addSections(Element output, Element body) {
  -         List content = getBodyContent(body.content());
  -         Element section = null;         
  -        Element p = null;
  -        
  -         for (Iterator iter = content.iterator(); iter.hasNext(); ) {
  -             Node node = (Node) iter.next();
  +        return doc;
  +    }
   
  -            String name = node.getName();
  -            if (name != null && name.startsWith("h")) {
  -                /** @todo we should handle child headings as a nested section */
  -                section = output.addElement("section");
  -                section.addAttribute("name", node.getText());                
  -                p = null; 
  -                     }
  -                     else {
  -                         if (section == null ) {
  -                             // we have a section with no name
  -                             // should we default it to be the same as the document title?
  -                             section = output.addElement("section");
  -                         }
  -                         
  -                if (node instanceof CharacterData) {
  -                    // lets add a <p>
  -                    if (p == null) { 
  -                     p = section.addElement("p");
  -                    }
  -                    p.addText( node.getText() );
  -                }
  -                else {
  -                    section.add(cloneNode(node));
  -                    p = null; 
  -                }
  +    /**
  +     * Iterates through the given body looking for h1, h2, h3 nodes and
  +     * creating the associated section elements. Any text nodes 
  +     * contained inside the body are wrapped in a &lt;p&gt; element
  +     * 
  +     * @param output the output destination
  +     * @param body the block of HTML markup to convert
  +     */
  +    protected void addSections(Element output, Element body) {
  +        List content = getBodyContent(body.content());
  +
  +        for (Iterator iter = content.iterator(); iter.hasNext();) {
  +            Node node = (Node) iter.next();
  +            if (isHeading(node)) {
  +                makeSection(output, node);
  +            } else {
  +                guaranteeHasSection(output);
  +                processNode(node);
               }
  -         }
  -     }
  -     
  -     /**
  -      * @param node
  -      * @return true if the given node is a heading element (h1, h2, h3 etc)
  -      */
  -     protected boolean isHeading(Node node) {
  -             String name = node.getName();
  -             return name != null && name.startsWith("h");
  -     }
  -     
  -     /**
  -      * Returns a copy of the body content, removing any whitespace from the beginning and end
  -      * @param body
  -      * @return List
  -      */
  +        }
  +
  +    }
  +
  +    /**
  +     * main algorithm which represents the iteration contract.
  +     * Use the protected methods to change the behavior.
  +     *
  +     * @param node the node to process
  +     */
  +    private void processNode(Node node) {
  +        if (isCharacterData(node)) {
  +            addTextNode(node);
  +        } else if (isTextFormatting(node)) {
  +            addFormattingNode(node);
  +        } else {
  +            addNode(node);
  +        }
  +    }
  +
  +    /**
  +     * Specifies whether the node is a text modifying construct that should be
  +     * passed as is to the resulting XDoc document, such as an anchor '&lt;a&gt;'.
  +     * 
  +     * @param node the node to check
  +     * @return true if the node is used to modify the formatting of the
  +     *         text; otherwise, false 
  +     */
  +    protected boolean isTextFormatting(Node node) {
  +        // Ultimately this needs bold, italic, and so on
  +        return node.getName() != null && node.getName().equals("a");
  +    }
  +
  +    /**
  +     * Specifies whether the node is character data and should be passed as
  +     * straight text to the resulting XDoc document.
  +     * 
  +     * @param node the node to check
  +     * @return true if the node is a text node; otherwise, false.
  +     */
  +    protected boolean isCharacterData(Node node) {
  +        return node instanceof CharacterData
  +            && (node instanceof Comment) == false;
  +    }
  +
  +    /**
  +     * Specifies whether the node is a heading node.
  +     * 
  +     * @param node the node to check
  +     * @return true if the given node is a heading element
  +     *         (h1, h2, h3 etc); otherwise, false
  +     */
  +    protected boolean isHeading(Node node) {
  +        String name = node.getName();
  +        return name != null && name.startsWith("h");
  +    }
  +
  +    /**
  +     * Determines the heading level of the node.
  +     * 
  +     * @param node the node to check
  +     * @return the integer level of the heading
  +     */
  +    protected int determineHeadingLevel(Node node) {
  +        try {
  +            String name = node.getName().substring(1);
  +            return Integer.parseInt(name);
  +        } catch (NumberFormatException nfe) {
  +            return 1;
  +        }
  +    }
  +
  +    /**
  +     * Creates a section or subsection as necessary based on the node
  +     * for the output document.
  +     * 
  +     * @param output the output document to attach the section
  +     * @param node the node to base making a section on
  +     */
  +    protected void makeSection(Element output, Node node) {
  +        int level = determineHeadingLevel(node);
  +        if (needsNewSection(node)) {
  +            currentNode = output.addElement("section");
  +            currentSectionHeadingLevel = level;
  +            currentSectionNode = currentNode;
  +        } else {
  +            currentNode = currentSectionNode.addElement("subsection");
  +        }
  +        currentNode.addAttribute("name", node.getText());
  +        currentParaNode = null;
  +    }
  +
  +    /**
  +     * Determines if a new section is needed, based on whether the
  +     * node's heading level is equal to or less than the current
  +     * section's heading level, or no section exists yet.
  +     *  
  +     * @param node the node to check
  +     * @return true if the current node's information means for a new
  +     *         section; otherwise, false
  +     */
  +    protected boolean needsNewSection(Node node) {
  +        int level = determineHeadingLevel(node);
  +        return level <= currentSectionHeadingLevel
  +            || currentSectionNode == null;
  +    }
  +
  +    /**
  +     * Makes sure a current paragraph node exists, creating one if needed.
  +     */
  +    private void guaranteeHasParaNode() {
  +        if (currentParaNode == null) {
  +            currentParaNode = currentNode.addElement("p");
  +        }
  +    }
  +
  +    /**
  +     * Makes sure there is a current node, creating an unnamed section if necessary.
  +     * @param output the output element to add the section to
  +     */
  +    private void guaranteeHasSection(Element output) {
  +        if (currentNode == null) {
  +            // we have a section with no name
  +            // should we default it to be the same as the document title?
  +            currentNode = output.addElement("section");
  +        }
  +    }
  +
  +    /**
  +     * Add the node to the current node.
  +     * @param node the node to add
  +     */
  +    private void addNode(Node node) {
  +        currentNode.add(cloneNode(node));
  +        currentParaNode = null;
  +    }
  +
  +    /**
  +     * Adds the text of the node to the current paragraph.
  +     * @param node the node to add
  +     */
  +    private void addTextNode(Node node) {
  +        guaranteeHasParaNode();
  +        currentParaNode.addText(node.getText());
  +    }
  +
  +    /**
  +     * Adds the node to the current paragraph.
  +     * @param node the node to add
  +     */
  +    private void addFormattingNode(Node node) {
  +        guaranteeHasParaNode();
  +        currentParaNode.add(cloneNode(node));
  +    }
  +
  +    /**
  +     * Returns a copy of the body content, removing any whitespace from
  +     * the beginning and end.
  +     *
  +     * @param content the content node list to obtain body content from
  +     * @return List
  +     */
       protected List getBodyContent(List content) {
           // lets turn <pre> into <source> and concatenate consective entries 
           Element lastPre = null;
  -        LinkedList  list = new LinkedList();
  +        LinkedList list = new LinkedList();
           boolean lastWasElement = true;
  -        for (Iterator iter = content.iterator(); iter.hasNext(); ) {
  +        for (Iterator iter = content.iterator(); iter.hasNext();) {
               Node node = (Node) iter.next();
  -            
  +
               if (isPre(node)) {
                   if (lastPre == null) {
                       lastPre = factory.createElement("source");
                       list.add(lastPre);
                   }
                   lastPre.addText(node.getText());
  -            }
  -            else {
  +            } else {
                   if (isWhitespace(node) && lastWasElement) {
                       if (lastPre != null) {
                           lastPre.addText(node.getText());
                       }
  -                }
  -                else {
  +                } else {
                       lastWasElement = node instanceof Element;
                       if (lastWasElement) {
  -                     lastPre = null;
  -                    }                
  -                     list.add(node);
  +                        lastPre = null;
  +                    }
  +                    list.add(node);
                   }
               }
  -        }        
  +        }
   
           // now lets remove any whitespace text nodes at the beginning and end
           while (true) {
  @@ -224,7 +372,11 @@
           }
           return list;
       }
  -    
  +
  +    /**
  +     * @param node the node to check
  +     * @return true if the node is a pre tag; otherwise false.
  +     */
       protected boolean isPre(Node node) {
           if (node instanceof Element) {
               Element element = (Element) node;
  @@ -234,6 +386,7 @@
       }
   
       /**
  +     * @param node the node to check
        * @return true if the given node is a whitespace text node 
        */
       protected boolean isWhitespace(Node node) {
  @@ -242,30 +395,31 @@
               return text.trim().length() <= 0;
           }
           if (node instanceof Element) {
  -            String name = node.getName(); 
  -                     if (name.equals("p")) {
  +            String name = node.getName();
  +            if (name.equals("p")) {
                   String text = node.getText();
                   return text.trim().length() <= 0;
  -                     }
  -                     if (name.equals("br")) {
  -                         return true;
  -                     }
  +            }
  +            if (name.equals("br")) {
  +                return true;
  +            }
           }
           return false;
       }
   
  -     /**
  -      * Normalizes the whitespace of any Elements
  -      * @param node
  -      * @return Node
  -      */    
  +    /**
  +     * Normalizes the whitespace of any Elements
  +     *
  +     * @param node the node to clone
  +     * @return Node the cloned node
  +     */
       protected Node cloneNode(Node node) {
  -        Node answer = (Node) node.clone(); 
  +        Node answer = (Node) node.clone();
           if (answer instanceof Element) {
               Element element = (Element) answer;
               element.normalize();
           }
           return answer;
       }
  -        
  +
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc Html2XdocBean.java

Reply via email to