Revision: 3889
Author: johnfargo
Date: Thu Dec 3 21:01:25 2009
Log: 1. Adds ability to override the DOMImplementation used to create a
Document in
DomParser.
2. Makes it possible to retain comments when parsing HTML.
3. Retains full document structure when a String whose first tag is <html> is
parsed as a fragment
(doesn't cull out presumed artificial nodes).
This change does not add comment serialization to Nodes.render(...), by
request.
These changes are designed to accommodate Shindig's HTML parsing needs,
while maintaining Caja's existing behavior/demands as much as possible.
Comments on style, API preference, or anything else welcome.
http://code.google.com/p/google-caja/source/detail?r=3889
Modified:
/trunk/src/com/google/caja/parser/html/CajaTreeBuilder.java
/trunk/src/com/google/caja/parser/html/DomParser.java
/trunk/src/com/google/caja/parser/html/Html5ElementStack.java
/trunk/src/com/google/caja/parser/html/OpenElementStack.java
/trunk/src/com/google/caja/parser/html/XmlElementStack.java
/trunk/tests/com/google/caja/parser/html/DomParserTest.java
=======================================
--- /trunk/src/com/google/caja/parser/html/CajaTreeBuilder.java Fri Nov 13
11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/CajaTreeBuilder.java Thu Dec 3
21:01:25 2009
@@ -28,6 +28,7 @@
import java.util.Set;
import org.w3c.dom.Attr;
+import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
@@ -78,7 +79,7 @@
this.doc = doc;
this.needsDebugData = needsDebugData;
this.mq = mq;
- setIgnoringComments(true);
+ setIgnoringComments(false);
setScriptingEnabled(true); // Affects behavior of noscript
}
@@ -121,10 +122,18 @@
}
@Override
- protected void appendCommentToDocument(char[] buf, int start, int
length) {}
+ protected void appendCommentToDocument(char[] buf, int start, int
length) {
+ appendComment(doc.getDocumentElement(), buf, start, length);
+ }
@Override
- protected void appendComment(Node el, char[] buf, int start, int length)
{}
+ protected void appendComment(Node el, char[] buf, int start, int length)
{
+ Comment comment = doc.createComment(new String(buf, start, length));
+ el.appendChild(comment);
+ if (needsDebugData) {
+ Nodes.setFilePositionFor(comment, startTok.pos);
+ }
+ }
@Override
protected void appendCharacters(
=======================================
--- /trunk/src/com/google/caja/parser/html/DomParser.java Fri Nov 20
16:58:55 2009
+++ /trunk/src/com/google/caja/parser/html/DomParser.java Thu Dec 3
21:01:25 2009
@@ -60,12 +60,14 @@
*
* @author [email protected]
*/
-public final class DomParser {
+public class DomParser {
private final TokenQueue<HtmlTokenType> tokens;
private final boolean asXml;
private final MessageQueue mq;
private final Namespaces ns;
private boolean needsDebugData = true;
+ private boolean wantsComments = false;
+ private DOMImplementation domImpl = null;
public DomParser(
TokenQueue<HtmlTokenType> tokens, boolean asXml, MessageQueue mq) {
@@ -132,27 +134,44 @@
: OpenElementStack.Factory.createHtml5ElementStack(
doc, needsDebugData, mq);
}
-
+
+ public void setDomImpl(DOMImplementation domImpl) {
+ this.domImpl = domImpl;
+ }
+
+ public void setWantsComments(boolean wantsComments) {
+ this.wantsComments = wantsComments;
+ }
+
public static Document makeDocument(
- Function<DOMImplementation, DocumentType> doctypeMaker, String
features) {
+ Function<DOMImplementation, DocumentType> doctypeMaker, String
features,
+ DOMImplementation domImpl) {
if (features == null) { features = "XML 1.0 Traversal"; }
- DOMImplementation impl;
- try {
- impl = DOMImplementationRegistry.newInstance()
- .getDOMImplementation(features);
- } catch (ClassNotFoundException ex) {
- throw new RuntimeException(
- "Missing DOM implementation. Is Xerces on the classpath?", ex);
- } catch (IllegalAccessException ex) {
- throw new RuntimeException(
- "Missing DOM implementation. Is Xerces on the classpath?", ex);
- } catch (InstantiationException ex) {
- throw new RuntimeException(
- "Missing DOM implementation. Is Xerces on the classpath?", ex);
- }
+ if (domImpl == null) {
+ try {
+ domImpl = DOMImplementationRegistry.newInstance()
+ .getDOMImplementation(features);
+ } catch (ClassNotFoundException ex) {
+ throw new RuntimeException(
+ "Missing DOM implementation. Is Xerces on the classpath?",
ex);
+ } catch (IllegalAccessException ex) {
+ throw new RuntimeException(
+ "Missing DOM implementation. Is Xerces on the classpath?",
ex);
+ } catch (InstantiationException ex) {
+ throw new RuntimeException(
+ "Missing DOM implementation. Is Xerces on the classpath?",
ex);
+ }
+ }
+
DocumentType doctype = doctypeMaker != null
- ? doctypeMaker.apply(impl) : null;
- return impl.createDocument(null, null, doctype);
+ ? doctypeMaker.apply(domImpl) : null;
+ return domImpl.createDocument(null, null, doctype);
+
+ }
+
+ public static Document makeDocument(
+ Function<DOMImplementation, DocumentType> doctypeMaker, String
features) {
+ return makeDocument(doctypeMaker, features, null);
}
/** Parse a document returning the document element. */
@@ -163,7 +182,7 @@
/** Parse a document returning the document element. */
public Element parseDocument(String features) throws ParseException {
Function<DOMImplementation, DocumentType> doctypeMaker = findDoctype();
- Document doc = makeDocument(doctypeMaker, features);
+ Document doc = makeDocument(doctypeMaker, features, domImpl);
OpenElementStack elementStack = makeElementStack(doc, mq);
// Make sure the elementStack is empty.
@@ -232,7 +251,7 @@
* If there is a DOCTYPE, it will be used to seed the default namespace.
*/
public DocumentFragment parseFragment() throws ParseException {
- return parseFragment(makeDocument(findDoctype(), null));
+ return parseFragment(makeDocument(findDoctype(), null, domImpl));
}
/**
@@ -249,15 +268,16 @@
elementStack.open(true);
while (!tokens.isEmpty()) {
- // Skip over top level comments, and whitespace only text nodes.
+ // Skip over top level doctypes, and whitespace only text nodes.
// Whitespace is significant for XML unless the schema specifies
// otherwise, but whitespace outside the root element is not. There
is
// one exception for whitespace preceding the prologue.
+ // Comments are ignored by the underlying TreeBuilder unless
explicitly
+ // configured otherwise.
Token<HtmlTokenType> t = tokens.peek();
switch (t.type) {
- case COMMENT:
- case DIRECTIVE: // especially DOCTYPEs
+ case DIRECTIVE: // ignore DOCTYPEs
tokens.advance();
continue;
default: break;
@@ -402,6 +422,7 @@
InputSource is, Reader in, boolean asXml) throws IOException {
return makeTokenQueue(FilePosition.startOfFile(is), in, asXml);
}
+
/**
* Creates a TokenQueue suitable for this class's parse methods.
* @param pos the position of the first character on in.
@@ -462,6 +483,9 @@
out.processText(t);
return;
case COMMENT:
+ if (wantsComments) {
+ out.processComment(t);
+ }
continue;
default:
throw new ParseException(new Message(
=======================================
--- /trunk/src/com/google/caja/parser/html/Html5ElementStack.java Fri Nov
13 11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/Html5ElementStack.java Thu Dec
3 21:01:25 2009
@@ -58,6 +58,8 @@
private final boolean needsDebugData;
private boolean isFragment;
private boolean needsNamespaceFixup;
+ private boolean topLevelHtmlFromInput = false;
+ private boolean processingFirstTag = true;
/**
* @param needsDebugData see {@link DomParser#setNeedsDebugData(boolean)}
@@ -156,12 +158,13 @@
if (needsDebugData) {
Nodes.setFilePositionFor(result, builder.getFragmentBounds());
}
- if (!isFragment) {
+
+ final Node first = root.getFirstChild();
+
+ if (!isFragment || topLevelHtmlFromInput) {
result.appendChild(root);
return result;
}
-
- final Node first = root.getFirstChild();
// If disposing of the html, body, or head elements would lose info
don't
// do it, so look for attributes.
@@ -289,6 +292,13 @@
boolean isEndTag = CajaTreeBuilder.isEndTag(start.text);
String tagName = start.text.substring(isEndTag ? 2 : 1);
boolean isHtml = checkName(tagName);
+ if (processingFirstTag && Strings.equalsIgnoreCase("html", tagName))
{
+ // Indicate to fragment-retrieval code that the top-level
+ // <html> element came from the input, and wasn't synthesized
+ // by the underlying parser implementation.
+ topLevelHtmlFromInput = true;
+ }
+ processingFirstTag = false;
if (isHtml) { tagName = Strings.toLowerCase(tagName); }
// Intern since the TreeBuilder likes to compare strings by
reference.
tagName = tagName.intern();
@@ -345,7 +355,29 @@
throw new RuntimeException(ex);
}
}
-
+
+ /**
+ * Adds the given comment node to the DOM.
+ */
+ public void processComment(Token<HtmlTokenType> commentToken) {
+ String text = commentToken.text.substring("<!--".length(),
+ commentToken.text.lastIndexOf("--"));
+ commentToken = Token.instance(text, commentToken.type,
commentToken.pos);
+ char[] chars;
+ int n = text.length();
+ if (n <= charBuf.length) {
+ chars = charBuf;
+ text.getChars(0, n, chars, 0);
+ } else {
+ chars = text.toCharArray();
+ }
+ builder.setTokenContext(commentToken, commentToken);
+ try {
+ builder.comment(chars, n);
+ } catch (SAXException ex) {
+ throw new RuntimeException(ex);
+ }
+ }
private boolean checkName(String qname) {
if (qname.indexOf(':', 1) < 0) {
=======================================
--- /trunk/src/com/google/caja/parser/html/OpenElementStack.java Fri Nov 13
11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/OpenElementStack.java Thu Dec 3
21:01:25 2009
@@ -73,6 +73,11 @@
* Adds the given text node to the DOM.
*/
void processText(Token<HtmlTokenType> text);
+
+ /**
+ * Adds the given comment node to the DOM.
+ */
+ void processComment(Token<HtmlTokenType> comment);
/**
* Called before parsing starts.
=======================================
--- /trunk/src/com/google/caja/parser/html/XmlElementStack.java Fri Nov 13
11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/XmlElementStack.java Thu Dec 3
21:01:25 2009
@@ -27,6 +27,7 @@
import java.util.List;
import org.w3c.dom.Attr;
+import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
@@ -171,6 +172,19 @@
}
doAppend(textNode, parent);
}
+
+ /**
+ * Adds the given comment node to the DOM.
+ */
+ public void processComment(Token<HtmlTokenType> commentToken) {
+ String text = commentToken.text.substring("<!--".length(),
+ commentToken.text.lastIndexOf("--"));
+ Comment comment = doc.createComment(text);
+ if (needsDebugData) {
+ Nodes.setFilePositionFor(comment, commentToken.pos);
+ }
+ doAppend(comment, getBottom().n);
+ }
/** {@inheritdoc} */
public void finish(FilePosition endOfDocument)
=======================================
--- /trunk/tests/com/google/caja/parser/html/DomParserTest.java Fri Nov 20
16:58:55 2009
+++ /trunk/tests/com/google/caja/parser/html/DomParserTest.java Thu Dec 3
21:01:25 2009
@@ -533,7 +533,68 @@
)
);
}
-
+
+ public final void testFragmentThatEndsWithACommentRetained()
+ throws Exception {
+ assertParsedHtmlFragmentWithComments(
+ Arrays.asList(
+ "<p>Hello</p> <!-- Zoicks --> "),
+ Arrays.asList(
+ "Fragment 1+1-1+33",
+ " Element : p 1+1-1+13",
+ " Text : Hello 1+4-1+9",
+ " Text : 1+13-1+15",
+ " Comment : Zoicks 1+15-1+30",
+ " Text : 1+30-1+33"
+ ),
+ Arrays.<String>asList(
+ ),
+ Arrays.asList(
+ // Parses as comment, but comment is suppressed by
Nodes.render()
+ "<p>Hello</p> "
+ )
+ );
+ }
+
+ public final void testFragmentWithTopLevelHtmlNodeRetained()
+ throws Exception {
+ assertParsedHtmlFragmentWithComments(
+ Arrays.asList(
+ "<html>",
+ "<head><script>foo</script></head>",
+ "<!-- Above body -->",
+ "<body>",
+ " <!-- In body -->",
+ "</body>",
+ "</html>"),
+ Arrays.asList(
+ "Fragment 1+1-7+8",
+ " Element : html 1+1-7+8",
+ " Text : \\n 1+7-2+1",
+ " Element : head 2+1-2+34",
+ " Element : script 2+7-2+27",
+ " Text : foo 2+15-2+18",
+ " Text : \\n 2+34-3+1",
+ " Comment : Above body 3+1-3+20",
+ " Text : \\n 3+20-4+1",
+ " Element : body 4+1-6+8",
+ " Text : \\n 4+7-5+3",
+ " Comment : In body 5+3-5+19",
+ " Text : \\n\\n 5+19-7+1"),
+ Arrays.<String>asList(
+ ),
+ Arrays.asList(
+ // Again comment is parsed but suppressed for now at output.
+ "<html>",
+ "<head><script>foo</script></head>",
+ "",
+ "<body>",
+ " ",
+ "",
+ "</body></html>")
+ );
+ }
+
public final void testTableFragment() throws Exception {
assertParsedHtmlFragment(
Arrays.asList(
@@ -2195,7 +2256,17 @@
assertParsedMarkup(htmlInput, expectedParseTree, expectedMessages,
expectedOutputHtml, false, true);
}
-
+
+ private void assertParsedHtmlFragmentWithComments(
+ List<String> htmlInput,
+ List<String> expectedParseTree,
+ List<String> expectedMessages,
+ List<String> expectedOutputHtml)
+ throws ParseException {
+ assertParsedMarkup(htmlInput, expectedParseTree, expectedMessages,
+ expectedOutputHtml, false, true, true);
+ }
+
private void assertParsedMarkup(
List<String> htmlInput,
List<String> expectedParseTree,
@@ -2204,6 +2275,19 @@
Boolean asXml,
boolean fragment)
throws ParseException {
+ assertParsedMarkup(htmlInput, expectedParseTree, expectedMessages,
+ expectedOutputHtml, asXml, fragment, false);
+ }
+
+ private void assertParsedMarkup(
+ List<String> htmlInput,
+ List<String> expectedParseTree,
+ List<String> expectedMessages,
+ List<String> expectedOutputHtml,
+ Boolean asXml,
+ boolean fragment,
+ boolean wantsComments)
+ throws ParseException {
System.err.println("\n\nStarting " + getName()
+ "\n===================");
mq.getMessages().clear();
@@ -2218,6 +2302,7 @@
p = new DomParser(lexer, is, mq);
asXml = lexer.getTreatedAsXml();
}
+ p.setWantsComments(wantsComments);
Node tree = fragment ? p.parseFragment() : p.parseDocument();
List<String> actualParseTree = formatLines(tree);
@@ -2247,6 +2332,7 @@
Join.join("\n", htmlInput), asXml);
DomParser noDebugParser = new DomParser(
tq, p.asXml(), DevNullMessageQueue.singleton());
+ noDebugParser.setWantsComments(wantsComments);
treeWithoutDebugData = fragment
? noDebugParser.parseFragment()
: noDebugParser.parseDocument();
@@ -2345,6 +2431,11 @@
formatValue(node.getNodeValue());
formatPosition(Nodes.getFilePositionFor(node));
break;
+ case Node.COMMENT_NODE:
+ out.append("Comment : ");
+ formatValue(node.getNodeValue());
+ formatPosition(Nodes.getFilePositionFor(node));
+ break;
default:
out.append(node.getNodeName());
formatPosition(Nodes.getFilePositionFor(node));