dion 2003/08/04 07:12:58
Modified: src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc
TestHtml2Xdoc.java
src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc
Html2XdocBean.java
Added: src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc
h3h4.html h1h2.html comment.html comment.xml
h1h2.xml link.xml link.html
Log:
Tests pass and it appears to work.
Applied patch for MAVEN-550
Revision Changes Path
1.2 +4 -0
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/TestHtml2Xdoc.java
Index: TestHtml2Xdoc.java
===================================================================
RCS file:
/home/cvs/maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/TestHtml2Xdoc.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- TestHtml2Xdoc.java 6 Mar 2003 19:15:10 -0000 1.1
+++ TestHtml2Xdoc.java 4 Aug 2003 14:12:58 -0000 1.2
@@ -108,6 +108,10 @@
//-------------------------------------------------------------------------
public void testOne() throws Exception {
assertConversion("input1.html", "output1.xml");
+ assertConversion("h1h2.html", "h1h2.xml");
+ assertConversion("h3h4.html", "h1h2.xml");
+ assertConversion("link.html", "link.xml");
+ assertConversion("comment.html", "comment.xml");
}
// Implementation methods
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/h3h4.html
Index: h3h4.html
===================================================================
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<title>A title</title>
</head>
<body>
<h3>A section title</h3>
Some text
<br/>
More text
<h4>a subsection</h4>
This is a subsection. It only has this.
</body>
</html>
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/h1h2.html
Index: h1h2.html
===================================================================
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<title>A title</title>
</head>
<body>
<h1>A section title</h1>
Some text
<br/>
More text
<h2>a subsection</h2>
This is a subsection. It only has this.
</body>
</html>
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/comment.html
Index: comment.html
===================================================================
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<title>A title</title>
</head>
<body>
<h1>A section title</h1>
<!-- #aComment entry -->
Some text with a <a href="somewhere">link</a>.
</body>
</html>
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/comment.xml
Index: comment.xml
===================================================================
<document>
<properties>
<title>A title</title>
</properties>
<body>
<section name="A section title">
<!-- #aComment entry -->
<p>Some text with a <a href="somewhere">link</a>.</p>
</section>
</body>
</document>
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/h1h2.xml
Index: h1h2.xml
===================================================================
<document>
<properties>
<title>A title</title>
</properties>
<body>
<section name="A section title">
<p>Some text
</p>
<br/>
<p>More text
</p>
<subsection name="a subsection">
<p>This is a subsection. It only has this.
</p>
</subsection>
</section>
</body>
</document>
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/link.xml
Index: link.xml
===================================================================
<document>
<properties>
<title>A title</title>
</properties>
<body>
<section name="A section title">
<p>Some text with a <a href="somewhere">link</a>.</p>
</section>
</body>
</document>
1.1
maven/src/plugins-build/html2xdoc/src/test/org/apache/maven/html2xdoc/link.html
Index: link.html
===================================================================
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<title>A title</title>
</head>
<body>
<h1>A section title</h1>
Some text with a <a href="somewhere">link</a>.
</body>
</html>
1.2 +264 -110
maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc/Html2XdocBean.java
Index: Html2XdocBean.java
===================================================================
RCS file:
/home/cvs/maven/src/plugins-build/html2xdoc/src/main/org/apache/maven/html2xdoc/Html2XdocBean.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Html2XdocBean.java 6 Mar 2003 19:15:11 -0000 1.1
+++ Html2XdocBean.java 4 Aug 2003 14:12:58 -0000 1.2
@@ -62,7 +62,6 @@
package org.apache.maven.html2xdoc;
-import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
@@ -70,140 +69,289 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.CharacterData;
+import org.dom4j.Comment;
import org.dom4j.Document;
import org.dom4j.DocumentFactory;
import org.dom4j.Element;
import org.dom4j.Node;
/**
- * A simple bean for converting a HTML document into an XDoc compliant XML document.
+ * A simple bean for converting a HTML document into an XDoc compliant XML
+ * document.
* This could be done via XSLT but is a little more complex than it might first
* appear so its done via Java code instead.
*
* @author <a href="mailto:[EMAIL PROTECTED]">James Strachan</a>
*/
public class Html2XdocBean {
-
+
/** The Log to which logging calls will be made. */
private static final Log log = LogFactory.getLog(Html2XdocBean.class);
-
- private DocumentFactory factory = new DocumentFactory();
- /**
- * Converts the given HTML document into the corresponding XDoc format
- * of XML
- *
- * @param html
- * @return Document
- */
- public Document convert(Document html) {
- Document doc = factory.createDocument();
- Element root = doc.addElement("document");
- Element properties = root.addElement("properties");
- Element title = properties.addElement("title");
- title.setText(html.valueOf("/html/head/title"));
-
- Element body = root.addElement("body");
-
- Element htmlContent = (Element) html.selectSingleNode("/html/body");
+ /**
+ * Used to create the output document
+ */
+ private DocumentFactory factory = new DocumentFactory();
+
+ /**
+ * The current node to attach the sub-nodes.
+ */
+ private Element currentNode = null;
+
+ /**
+ * The current 'root' section node. This is used to keep
+ * track of the root section so that when a subsection is
+ * found it can be associated correctly.
+ */
+ private Element currentSectionNode = null;
+
+ /**
+ * The current section heading level. If a subsequent heading's level
+ * is lower or equal, then a new section is created.
+ */
+ private int currentSectionHeadingLevel = Integer.MIN_VALUE;
+
+ /**
+ * The current paragraph node. This is used to associate text
+ * and formatting nodes to the current paragraph node.
+ */
+ private Element currentParaNode = null;
+
+ /**
+ * Converts the given HTML document into the corresponding XDoc format
+ * of XML
+ *
+ * @param html the input html document
+ * @return Document
+ */
+ public Document convert(Document html) {
+ Document doc = factory.createDocument();
+ Element root = doc.addElement("document");
+ Element properties = root.addElement("properties");
+ Element title = properties.addElement("title");
+ title.setText(html.valueOf("/html/head/title"));
+
+ Element body = root.addElement("body");
+
+ Element htmlContent = (Element) html.selectSingleNode("/html/body");
if (htmlContent == null) {
- log.info("No body element found for HTML document: " + html.asXML());
- }
- else {
+ log.info("No body element found for HTML document: "
+ + html.asXML());
+ } else {
addSections(body, htmlContent);
}
- return doc;
- }
-
- /**
- * Iterates thorugh the given body looking for h1, h2, h3 nodes and
- * creating the associated section elements. Any text nodes
- * contained inside the body are wrapped in a <p> element
- *
- * @param output the output destination
- * @param body the block of HTML markup to convert
- */
- protected void addSections(Element output, Element body) {
- List content = getBodyContent(body.content());
- Element section = null;
- Element p = null;
-
- for (Iterator iter = content.iterator(); iter.hasNext(); ) {
- Node node = (Node) iter.next();
+ return doc;
+ }
- String name = node.getName();
- if (name != null && name.startsWith("h")) {
- /** @todo we should handle child headings as a nested section */
- section = output.addElement("section");
- section.addAttribute("name", node.getText());
- p = null;
- }
- else {
- if (section == null ) {
- // we have a section with no name
- // should we default it to be the same as the document title?
- section = output.addElement("section");
- }
-
- if (node instanceof CharacterData) {
- // lets add a <p>
- if (p == null) {
- p = section.addElement("p");
- }
- p.addText( node.getText() );
- }
- else {
- section.add(cloneNode(node));
- p = null;
- }
+ /**
+ * Iterates through the given body looking for h1, h2, h3 nodes and
+ * creating the associated section elements. Any text nodes
+ * contained inside the body are wrapped in a <p> element
+ *
+ * @param output the output destination
+ * @param body the block of HTML markup to convert
+ */
+ protected void addSections(Element output, Element body) {
+ List content = getBodyContent(body.content());
+
+ for (Iterator iter = content.iterator(); iter.hasNext();) {
+ Node node = (Node) iter.next();
+ if (isHeading(node)) {
+ makeSection(output, node);
+ } else {
+ guaranteeHasSection(output);
+ processNode(node);
}
- }
- }
-
- /**
- * @param node
- * @return true if the given node is a heading element (h1, h2, h3 etc)
- */
- protected boolean isHeading(Node node) {
- String name = node.getName();
- return name != null && name.startsWith("h");
- }
-
- /**
- * Returns a copy of the body content, removing any whitespace from the beginning and end
- * @param body
- * @return List
- */
+ }
+
+ }
+
+ /**
+ * main algorithm which represents the iteration contract.
+ * Use the protected methods to change the behavior.
+ *
+ * @param node the node to process
+ */
+ private void processNode(Node node) {
+ if (isCharacterData(node)) {
+ addTextNode(node);
+ } else if (isTextFormatting(node)) {
+ addFormattingNode(node);
+ } else {
+ addNode(node);
+ }
+ }
+
+ /**
+ * Specifies whether the node is a text modifying construct that should be
+ * passed as is to the resultant XDoc, such as an anchor '<a>'.
+ *
+ * @param node the node to check
+ * @return true if the node is used to modify the formatting of the
+ * text; otherwise, false
+ */
+ protected boolean isTextFormatting(Node node) {
+ // Ultimately this needs bold, italic, and so on
+ return node.getName() != null && node.getName().equals("a");
+ }
+
+ /**
+ * Specifies whether the node is character data and should be passed as
+ * straight text to the resultant XDoc.
+ *
+ * @param node the node to check
+ * @return true if the node is a text node; otherwise, false.
+ */
+ protected boolean isCharacterData(Node node) {
+ return node instanceof CharacterData
+ && (node instanceof Comment) == false;
+ }
+
+ /**
+ * Specifies whether the node is a heading node.
+ *
+ * @param node the node to check
+ * @return true if the given node is a heading element
+ * (h1, h2, h3 etc); otherwise, false
+ */
+ protected boolean isHeading(Node node) {
+ String name = node.getName();
+ return name != null && name.startsWith("h");
+ }
+
+ /**
+ * Determines the heading level of the node.
+ *
+ * @param node the node to check
+ * @return the integer level of the heading
+ */
+ protected int determineHeadingLevel(Node node) {
+ try {
+ String name = node.getName().substring(1);
+ return Integer.parseInt(name);
+ } catch (NumberFormatException nfe) {
+ return 1;
+ }
+ }
+
+ /**
+ * Creates a section or subsection as necessary based on the node
+ * for the output document.
+ *
+ * @param output the output document to attach the section
+ * @param node the node to base making a section on
+ */
+ protected void makeSection(Element output, Node node) {
+ int level = determineHeadingLevel(node);
+ if (needsNewSection(node)) {
+ currentNode = output.addElement("section");
+ currentSectionHeadingLevel = level;
+ currentSectionNode = currentNode;
+ } else {
+ currentNode = currentSectionNode.addElement("subsection");
+ }
+ currentNode.addAttribute("name", node.getText());
+ currentParaNode = null;
+ }
+
+ /**
+ * Determines if a new section is needed, which is the case when the
+ * node's heading level is equal to or less than the current section's
+ * heading level, or when no section has been created yet.
+ *
+ * @param node the node to check
+ * @return true if the node's heading level calls for a new
+ * section; otherwise, false
+ */
+ protected boolean needsNewSection(Node node) {
+ int level = determineHeadingLevel(node);
+ return level <= currentSectionHeadingLevel
+ || currentSectionNode == null;
+ }
+
+ /**
+ * Makes sure a current paragraph node exists, creating one under the
+ * current node if necessary.
+ */
+ private void guaranteeHasParaNode() {
+ if (currentParaNode == null) {
+ currentParaNode = currentNode.addElement("p");
+ }
+ }
+
+ /**
+ * Makes sure there is a current section node, creating an unnamed
+ * section if necessary.
+ * @param output the output element to add the section to
+ */
+ private void guaranteeHasSection(Element output) {
+ if (currentNode == null) {
+ // we have a section with no name
+ // should we default it to be the same as the document title?
+ currentNode = output.addElement("section");
+ }
+ }
+
+ /**
+ * Add the node to the current node.
+ * @param node the node to add
+ */
+ private void addNode(Node node) {
+ currentNode.add(cloneNode(node));
+ currentParaNode = null;
+ }
+
+ /**
+ * Adds the text of the node to the current paragraph.
+ * @param node the node to add
+ */
+ private void addTextNode(Node node) {
+ guaranteeHasParaNode();
+ currentParaNode.addText(node.getText());
+ }
+
+ /**
+ * Adds the node to the current paragraph.
+ * @param node the node to add
+ */
+ private void addFormattingNode(Node node) {
+ guaranteeHasParaNode();
+ currentParaNode.add(cloneNode(node));
+ }
+
+ /**
+ * Returns a copy of the body content, removing any whitespace from
+ * the beginning and end.
+ *
+ * @param content the content node list to obtain body content from
+ * @return List
+ */
protected List getBodyContent(List content) {
// lets turn <pre> into <source> and concatenate consective entries
Element lastPre = null;
- LinkedList list = new LinkedList();
+ LinkedList list = new LinkedList();
boolean lastWasElement = true;
- for (Iterator iter = content.iterator(); iter.hasNext(); ) {
+ for (Iterator iter = content.iterator(); iter.hasNext();) {
Node node = (Node) iter.next();
-
+
if (isPre(node)) {
if (lastPre == null) {
lastPre = factory.createElement("source");
list.add(lastPre);
}
lastPre.addText(node.getText());
- }
- else {
+ } else {
if (isWhitespace(node) && lastWasElement) {
if (lastPre != null) {
lastPre.addText(node.getText());
}
- }
- else {
+ } else {
lastWasElement = node instanceof Element;
if (lastWasElement) {
- lastPre = null;
- }
- list.add(node);
+ lastPre = null;
+ }
+ list.add(node);
}
}
- }
+ }
// now lets remove any whitespace text nodes at the beginning and end
while (true) {
@@ -224,7 +372,11 @@
}
return list;
}
-
+
+ /**
+ * @param node the node to check
+ * @return true if the node is a pre tag; otherwise false.
+ */
protected boolean isPre(Node node) {
if (node instanceof Element) {
Element element = (Element) node;
@@ -234,6 +386,7 @@
}
/**
+ * @param node the node to check
* @return true if the given node is a whitespace text node
*/
protected boolean isWhitespace(Node node) {
@@ -242,30 +395,31 @@
return text.trim().length() <= 0;
}
if (node instanceof Element) {
- String name = node.getName();
- if (name.equals("p")) {
+ String name = node.getName();
+ if (name.equals("p")) {
String text = node.getText();
return text.trim().length() <= 0;
- }
- if (name.equals("br")) {
- return true;
- }
+ }
+ if (name.equals("br")) {
+ return true;
+ }
}
return false;
}
- /**
- * Normalizes the whitespace of any Elements
- * @param node
- * @return Node
- */
+ /**
+ * Normalizes the whitespace of any Elements
+ *
+ * @param node the node to clone
+ * @return Node the cloned node
+ */
protected Node cloneNode(Node node) {
- Node answer = (Node) node.clone();
+ Node answer = (Node) node.clone();
if (answer instanceof Element) {
Element element = (Element) answer;
element.normalize();
}
return answer;
}
-
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]