[Caja] [google-caja] r3889 committed - 1. Adds ability to override the DOMImplementation used to create a Doc...

google-caja Thu, 03 Dec 2009 21:02:01 -0800

Revision: 3889
Author: johnfargo
Date: Thu Dec  3 21:01:25 2009

Log: 1. Adds ability to override the DOMImplementation used to create a Document in DomParser.


2. Makes it possible to retain comments when parsing HTML.

3. Retains full document structure when a String whose first tag is <html> is parsed as a fragment (doesn't cull out presumed artificial nodes).

This change does not add comment serialization to Nodes.render(...), by request.


These changes are designed to accommodate Shindig's HTML parsing needs,
while maintaining Caja's existing behavior/demands as much as possible.

Comments on style, API preference, or anything else welcome.


http://code.google.com/p/google-caja/source/detail?r=3889

Modified:
 /trunk/src/com/google/caja/parser/html/CajaTreeBuilder.java
 /trunk/src/com/google/caja/parser/html/DomParser.java
 /trunk/src/com/google/caja/parser/html/Html5ElementStack.java
 /trunk/src/com/google/caja/parser/html/OpenElementStack.java
 /trunk/src/com/google/caja/parser/html/XmlElementStack.java
 /trunk/tests/com/google/caja/parser/html/DomParserTest.java

=======================================

--- /trunk/src/com/google/caja/parser/html/CajaTreeBuilder.java Fri Nov 13 11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/CajaTreeBuilder.java Thu Dec  3 21:01:25 2009

@@ -28,6 +28,7 @@
 import java.util.Set;

 import org.w3c.dom.Attr;
+import org.w3c.dom.Comment;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -78,7 +79,7 @@
     this.doc = doc;
     this.needsDebugData = needsDebugData;
     this.mq = mq;
-    setIgnoringComments(true);
+    setIgnoringComments(false);
     setScriptingEnabled(true);  // Affects behavior of noscript
   }

@@ -121,10 +122,18 @@
   }

   @Override

- protected void appendCommentToDocument(char[] buf, int start, intlength) {}+ protected void appendCommentToDocument(char[] buf, int start, intlength) {

+    appendComment(doc.getDocumentElement(), buf, start, length);
+  }

   @Override

- protected void appendComment(Node el, char[] buf, int start, int length){}+ protected void appendComment(Node el, char[] buf, int start, int length){

+    Comment comment = doc.createComment(new String(buf, start, length));
+    el.appendChild(comment);
+    if (needsDebugData) {
+      Nodes.setFilePositionFor(comment, startTok.pos);
+    }
+  }

   @Override
   protected void appendCharacters(
=======================================

--- /trunk/src/com/google/caja/parser/html/DomParser.java Fri Nov 20 16:58:55 2009
+++ /trunk/src/com/google/caja/parser/html/DomParser.java Thu Dec  3 21:01:25 2009

@@ -60,12 +60,14 @@
  *
  * @author [email protected]
  */
-public final class DomParser {
+public class DomParser {
   private final TokenQueue<HtmlTokenType> tokens;
   private final boolean asXml;
   private final MessageQueue mq;
   private final Namespaces ns;
   private boolean needsDebugData = true;
+  private boolean wantsComments = false;
+  private DOMImplementation domImpl = null;

   public DomParser(
       TokenQueue<HtmlTokenType> tokens, boolean asXml, MessageQueue mq) {
@@ -132,27 +134,44 @@
         : OpenElementStack.Factory.createHtml5ElementStack(
             doc, needsDebugData, mq);
   }
-
+
+  public void setDomImpl(DOMImplementation domImpl) {
+    this.domImpl = domImpl;
+  }
+
+  public void setWantsComments(boolean wantsComments) {
+    this.wantsComments = wantsComments;
+  }
+
   public static Document makeDocument(

- Function<DOMImplementation, DocumentType> doctypeMaker, Stringfeatures) {+ Function<DOMImplementation, DocumentType> doctypeMaker, Stringfeatures,

+      DOMImplementation domImpl) {
     if (features == null) { features = "XML 1.0 Traversal"; }
-    DOMImplementation impl;
-    try {
-      impl = DOMImplementationRegistry.newInstance()
-          .getDOMImplementation(features);
-    } catch (ClassNotFoundException ex) {
-      throw new RuntimeException(
-          "Missing DOM implementation.  Is Xerces on the classpath?", ex);
-    } catch (IllegalAccessException ex) {
-      throw new RuntimeException(
-          "Missing DOM implementation.  Is Xerces on the classpath?", ex);
-    } catch (InstantiationException ex) {
-      throw new RuntimeException(
-          "Missing DOM implementation.  Is Xerces on the classpath?", ex);
-    }
+    if (domImpl == null) {
+      try {
+        domImpl = DOMImplementationRegistry.newInstance()
+            .getDOMImplementation(features);
+      } catch (ClassNotFoundException ex) {
+        throw new RuntimeException(

+ "Missing DOM implementation. Is Xerces on the classpath?",ex);

+      } catch (IllegalAccessException ex) {
+        throw new RuntimeException(

+ "Missing DOM implementation. Is Xerces on the classpath?",ex);

+      } catch (InstantiationException ex) {
+        throw new RuntimeException(

+ "Missing DOM implementation. Is Xerces on the classpath?",ex);

+      }
+    }
+
     DocumentType doctype = doctypeMaker != null
-        ? doctypeMaker.apply(impl) : null;
-    return impl.createDocument(null, null, doctype);
+        ? doctypeMaker.apply(domImpl) : null;
+    return domImpl.createDocument(null, null, doctype);
+
+  }
+
+  public static Document makeDocument(

+ Function<DOMImplementation, DocumentType> doctypeMaker, Stringfeatures) {

+    return makeDocument(doctypeMaker, features, null);
   }

   /** Parse a document returning the document element. */
@@ -163,7 +182,7 @@
   /** Parse a document returning the document element. */
   public Element parseDocument(String features) throws ParseException {
     Function<DOMImplementation, DocumentType> doctypeMaker = findDoctype();
-    Document doc = makeDocument(doctypeMaker, features);
+    Document doc = makeDocument(doctypeMaker, features, domImpl);
     OpenElementStack elementStack = makeElementStack(doc, mq);

     // Make sure the elementStack is empty.
@@ -232,7 +251,7 @@
    * If there is a DOCTYPE, it will be used to seed the default namespace.
    */
   public DocumentFragment parseFragment() throws ParseException {
-    return parseFragment(makeDocument(findDoctype(), null));
+    return parseFragment(makeDocument(findDoctype(), null, domImpl));
   }

   /**
@@ -249,15 +268,16 @@
     elementStack.open(true);

     while (!tokens.isEmpty()) {
-      // Skip over top level comments, and whitespace only text nodes.
+      // Skip over top level doctypes, and whitespace only text nodes.
       // Whitespace is significant for XML unless the schema specifies

// otherwise, but whitespace outside the root element is not. Thereis

       // one exception for whitespace preceding the prologue.

+ // Comments are ignored by the underlying TreeBuilder unlessexplicitly

+      // configured otherwise.
       Token<HtmlTokenType> t = tokens.peek();

       switch (t.type) {
-        case COMMENT:
-        case DIRECTIVE:  // especially DOCTYPEs
+        case DIRECTIVE:  // ignore DOCTYPEs
           tokens.advance();
           continue;
         default: break;
@@ -402,6 +422,7 @@
       InputSource is, Reader in, boolean asXml) throws IOException {
     return makeTokenQueue(FilePosition.startOfFile(is), in, asXml);
   }
+
   /**
    * Creates a TokenQueue suitable for this class's parse methods.
    * @param pos the position of the first character on in.
@@ -462,6 +483,9 @@
           out.processText(t);
           return;
         case COMMENT:
+          if (wantsComments) {
+            out.processComment(t);
+          }
           continue;
         default:
           throw new ParseException(new Message(
=======================================

--- /trunk/src/com/google/caja/parser/html/Html5ElementStack.java Fri Nov 13 11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/Html5ElementStack.java Thu Dec  3 21:01:25 2009

@@ -58,6 +58,8 @@
   private final boolean needsDebugData;
   private boolean isFragment;
   private boolean needsNamespaceFixup;
+  private boolean topLevelHtmlFromInput = false;
+  private boolean processingFirstTag = true;

   /**
    * @param needsDebugData see {...@link DomParser#setNeedsDebugData(boolean)}
@@ -156,12 +158,13 @@
     if (needsDebugData) {
       Nodes.setFilePositionFor(result, builder.getFragmentBounds());
     }
-    if (!isFragment) {
+
+    final Node first = root.getFirstChild();
+
+    if (!isFragment || topLevelHtmlFromInput) {
       result.appendChild(root);
       return result;
     }
-
-    final Node first = root.getFirstChild();

// If disposing of the html, body, or head elements would lose infodon't

     // do it, so look for attributes.
@@ -289,6 +292,13 @@
       boolean isEndTag = CajaTreeBuilder.isEndTag(start.text);
       String tagName = start.text.substring(isEndTag ? 2 : 1);
       boolean isHtml = checkName(tagName);

+ if (processingFirstTag && Strings.equalsIgnoreCase("html", tagName)){

+        // Indicate to fragment-retrieval code that the top-level
+        // <html> element came from the input, and wasn't synthesized
+        // by the underlying parser implementation.
+        topLevelHtmlFromInput = true;
+      }
+      processingFirstTag = false;
       if (isHtml) { tagName = Strings.toLowerCase(tagName); }

// Intern since the TreeBuilder likes to compare strings byreference.

       tagName = tagName.intern();
@@ -345,7 +355,29 @@
       throw new RuntimeException(ex);
     }
   }
-
+
+  /**
+   * Adds the given comment node to the DOM.
+   */
+  public void processComment(Token<HtmlTokenType> commentToken) {
+    String text = commentToken.text.substring("<!--".length(),
+        commentToken.text.lastIndexOf("--"));

+ commentToken = Token.instance(text, commentToken.type,commentToken.pos);

+    char[] chars;
+    int n = text.length();
+    if (n <= charBuf.length) {
+      chars = charBuf;
+      text.getChars(0, n, chars, 0);
+    } else {
+      chars = text.toCharArray();
+    }
+    builder.setTokenContext(commentToken, commentToken);
+    try {
+      builder.comment(chars, n);
+    } catch (SAXException ex) {
+      throw new RuntimeException(ex);
+    }
+  }

   private boolean checkName(String qname) {
     if (qname.indexOf(':', 1) < 0) {
=======================================

--- /trunk/src/com/google/caja/parser/html/OpenElementStack.java Fri Nov 13 11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/OpenElementStack.java Thu Dec  3 21:01:25 2009

@@ -73,6 +73,11 @@
    * Adds the given text node to the DOM.
    */
   void processText(Token<HtmlTokenType> text);
+
+  /**
+   * Adds the given comment node to the DOM.
+   */
+  void processComment(Token<HtmlTokenType> comment);

   /**
    * Called before parsing starts.
=======================================

--- /trunk/src/com/google/caja/parser/html/XmlElementStack.java Fri Nov 13 11:43:08 2009
+++ /trunk/src/com/google/caja/parser/html/XmlElementStack.java Thu Dec  3 21:01:25 2009

@@ -27,6 +27,7 @@
 import java.util.List;

 import org.w3c.dom.Attr;
+import org.w3c.dom.Comment;
 import org.w3c.dom.Document;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
@@ -171,6 +172,19 @@
     }
     doAppend(textNode, parent);
   }
+
+  /**
+   * Adds the given comment node to the DOM.
+   */
+  public void processComment(Token<HtmlTokenType> commentToken) {
+    String text = commentToken.text.substring("<!--".length(),
+        commentToken.text.lastIndexOf("--"));
+    Comment comment = doc.createComment(text);
+    if (needsDebugData) {
+      Nodes.setFilePositionFor(comment, commentToken.pos);
+    }
+    doAppend(comment, getBottom().n);
+  }

   /** {...@inheritdoc} */
   public void finish(FilePosition endOfDocument)
=======================================

--- /trunk/tests/com/google/caja/parser/html/DomParserTest.java Fri Nov 20 16:58:55 2009
+++ /trunk/tests/com/google/caja/parser/html/DomParserTest.java Thu Dec  3 21:01:25 2009

@@ -533,7 +533,68 @@
             )
         );
   }
-
+
+  public final void testFragmentThatEndsWithACommentRetained()
+      throws Exception {
+    assertParsedHtmlFragmentWithComments(
+        Arrays.asList(
+            "<p>Hello</p>  <!-- Zoicks -->   "),
+        Arrays.asList(
+            "Fragment 1+1-1+33",
+            "  Element : p 1+1-1+13",
+            "    Text : Hello 1+4-1+9",
+            "  Text :    1+13-1+15",
+            "  Comment :  Zoicks  1+15-1+30",
+            "  Text :     1+30-1+33"
+            ),
+        Arrays.<String>asList(
+            ),
+        Arrays.asList(

+ // Parses as comment, but comment is suppressed byNodes.render()

+            "<p>Hello</p>     "
+            )
+        );
+  }
+
+  public final void testFragmentWithTopLevelHtmlNodeRetained()
+      throws Exception {
+    assertParsedHtmlFragmentWithComments(
+        Arrays.asList(
+            "<html>",
+            "<head><script>foo</script></head>",
+            "<!-- Above body -->",
+            "<body>",
+            "  <!-- In body -->",
+            "</body>",
+            "</html>"),
+        Arrays.asList(
+            "Fragment 1+1-7+8",
+            "  Element : html 1+1-7+8",
+            "    Text : \\n 1+7-2+1",
+            "    Element : head 2+1-2+34",
+            "      Element : script 2+7-2+27",
+            "        Text : foo 2+15-2+18",
+            "    Text : \\n 2+34-3+1",
+            "    Comment :  Above body  3+1-3+20",
+            "    Text : \\n 3+20-4+1",
+            "    Element : body 4+1-6+8",
+            "      Text : \\n   4+7-5+3",
+            "      Comment :  In body  5+3-5+19",
+            "      Text : \\n\\n 5+19-7+1"),
+        Arrays.<String>asList(
+            ),
+        Arrays.asList(
+            // Again comment is parsed but suppressed for now at output.
+            "<html>",
+            "<head><script>foo</script></head>",
+            "",
+            "<body>",
+            "  ",
+            "",
+            "</body></html>")
+        );
+  }
+
   public final void testTableFragment() throws Exception {
     assertParsedHtmlFragment(
         Arrays.asList(
@@ -2195,7 +2256,17 @@
     assertParsedMarkup(htmlInput, expectedParseTree, expectedMessages,
                        expectedOutputHtml, false, true);
   }
-
+
+  private void assertParsedHtmlFragmentWithComments(
+      List<String> htmlInput,
+      List<String> expectedParseTree,
+      List<String> expectedMessages,
+      List<String> expectedOutputHtml)
+      throws ParseException {
+    assertParsedMarkup(htmlInput, expectedParseTree, expectedMessages,
+                       expectedOutputHtml, false, true, true);
+  }
+
   private void assertParsedMarkup(
       List<String> htmlInput,
       List<String> expectedParseTree,
@@ -2204,6 +2275,19 @@
       Boolean asXml,
       boolean fragment)
       throws ParseException {
+    assertParsedMarkup(htmlInput, expectedParseTree, expectedMessages,
+                       expectedOutputHtml, asXml, fragment, false);
+  }
+
+  private void assertParsedMarkup(
+      List<String> htmlInput,
+      List<String> expectedParseTree,
+      List<String> expectedMessages,
+      List<String> expectedOutputHtml,
+      Boolean asXml,
+      boolean fragment,
+      boolean wantsComments)
+      throws ParseException {

System.err.println("\n\nStarting " + getName()+ "\n===================");

     mq.getMessages().clear();
@@ -2218,6 +2302,7 @@
       p = new DomParser(lexer, is, mq);
       asXml = lexer.getTreatedAsXml();
     }
+    p.setWantsComments(wantsComments);
     Node tree = fragment ? p.parseFragment() : p.parseDocument();

     List<String> actualParseTree = formatLines(tree);
@@ -2247,6 +2332,7 @@
           Join.join("\n", htmlInput), asXml);
       DomParser noDebugParser = new DomParser(
           tq, p.asXml(), DevNullMessageQueue.singleton());
+      noDebugParser.setWantsComments(wantsComments);
       treeWithoutDebugData = fragment
           ? noDebugParser.parseFragment()
           : noDebugParser.parseDocument();
@@ -2345,6 +2431,11 @@
           formatValue(node.getNodeValue());
           formatPosition(Nodes.getFilePositionFor(node));
           break;
+        case Node.COMMENT_NODE:
+          out.append("Comment : ");
+          formatValue(node.getNodeValue());
+          formatPosition(Nodes.getFilePositionFor(node));
+          break;
         default:
           out.append(node.getNodeName());
           formatPosition(Nodes.getFilePositionFor(node));

[Caja] [google-caja] r3889 committed - 1. Adds ability to override the DOMImplementation used to create a Doc...

Reply via email to