Revision: 5144
Author:   [email protected]
Date:     Thu Nov  8 14:46:38 2012
Log:      Replace host innerHTML with our own HTML5-compliant serializer.
http://codereview.appspot.com/6498123

Our previous strategy to provide a innerHTML getter was to invoke the
browser's, then sanitize the output. This strategy cannot implement
foreign node protection, since whether a node is foreign depends on its
object identity, which is not available starting from an innerHTML
string.

Therefore, it is replaced by our own implementation working off the
tame nodes (and therefore unable to provide excess authority over
regular DOM traversal), and operating according to the HTML5
specification's HTML fragment serialization algorithm.

[email protected]

http://code.google.com/p/google-caja/source/detail?r=5144

Modified:
 /trunk/src/com/google/caja/plugin/domado.js
 /trunk/tests/com/google/caja/plugin/JQueryTest.java
 /trunk/tests/com/google/caja/plugin/browser-test-case.js
 /trunk/tests/com/google/caja/plugin/es53-test-domado-dom-guest.html
 /trunk/tests/com/google/caja/plugin/es53-test-domado-foreign-guest.html

=======================================
--- /trunk/src/com/google/caja/plugin/domado.js Thu Nov  8 14:37:26 2012
+++ /trunk/src/com/google/caja/plugin/domado.js Thu Nov  8 14:46:38 2012
@@ -1833,51 +1833,6 @@
             return realValue;
         }
       }
-
-      /**
-       * Undoes some of the changes made by sanitizeHtml, e.g. stripping ID
-       * prefixes.
-       */
-      function tameInnerHtml(htmlText) {
-        var out = [];
-        innerHtmlTamer(htmlText, out);
-        return out.join('');
-      }
-      var innerHtmlTamer = html.makeSaxParser({
-          startTag: function (tagName, attribs, out) {
-            tagName = realToVirtualElementName(tagName);
-            out.push('<', tagName);
-            for (var i = 0; i < attribs.length; i += 2) {
-              var aname = '' + attribs[+i];
-              var atype = htmlSchema.attribute(tagName, aname).type;
-              var value = attribs[i + 1];
-              if (aname !== 'target' && atype !== void 0) {
-                value = virtualizeAttributeValue(atype, value);
-                if (typeof value === 'string') {
- out.push(' ', aname, '="', html.escapeAttrib(value), '"');
-                }
-              } else if (cajaPrefRe.test(aname)) {
-                out.push(' ', aname.substring(cajaPrefix.length), '="',
-                    html.escapeAttrib(value), '"');
-              }
-            }
-            out.push('>');
-          },
-          endTag: function (tagName, out) {
-            var rempty = htmlSchema.element(tagName).empty;
-            tagName = realToVirtualElementName(tagName);
-            var vempty = htmlSchema.element(tagName).empty;
-            if (vempty && !rempty) {
- // omit end tag because the browser doesn't see the virtualized
-              // element as empty
-              return;
-            }
-            out.push('</', tagName, '>');
-          },
-          pcdata: function (text, out) { out.push(text); },
-          rcdata: function (text, out) { out.push(text); },
-          cdata: function (text, out) { out.push(text); }
-        });

       function getSafeTargetAttribute(tagName, attribName, value) {
         if (value !== null) {
@@ -2030,6 +1985,141 @@
             return null;
         }
       }
+
+      // Implementation of HTML5 "HTML fragment serialization algorithm"
+ // per http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#html-fragment-serialization-algorithm
+      // as of 2012-09-11.
+      //
+ // Per HTML5: "Warning! It is possible that the output of this algorithm,
+      // if parsed with an HTML parser, will not return the original tree
+ // structure." Therefore, an innerHTML round-trip on a safe (from Caja's
+      // perspective) but malicious DOM may be able to attack guest code.
+ // TODO(kpreid): Evaluate desirability of prohibiting the worst cases of
+      // this in our DOM mutators.
+      function htmlFragmentSerialization(tameRoot) {
+        tameRoot = TameNodeT.coerce(tameRoot);
+        var sl = [];
+
+        // Note: This algorithm is implemented in terms of tame nodes, not
+ // feral nodes; therefore, it requires no access checks as it yields
+        // only information which clients can obtain by object access.
+        function recur(tameParent) {
+          var nodes = tameParent.childNodes;
+          var nNodes = nodes.length;
+          for (var i = 0; i < nNodes; i++) {
+            var tameCurrent = nodes.item(i);
+            switch (tameCurrent.nodeType) {
+              case 1:  // Element
+                // TODO(kpreid): namespace issues
+                var tagName = tameCurrent.tagName;
+                if (tagName === undefined) {
+                  // foreign node case
+                  continue;
+                }
+                tagName = tagName.toLowerCase();
+                    // TODO(kpreid): not conformant
+                sl.push('<', tagName);
+                var attrs = tameCurrent.attributes;
+                var nAttrs = attrs.length;
+                for (var j = 0; j < nAttrs; j++) {
+                  var attr = attrs.item(j);
+                  var aName = attr.name;
+                  if (aName === 'target') {
+                    // hide Caja-added link target attributes
+ // TODO(kpreid): Shouldn't these be hidden in the attributes + // list? This special case (and the one below) is emulating
+                    // tested-for behavior in a previous .innerHTML
+                    // implementation, not written from first principles.
+                    continue;
+                  }
+                  var aValue = attr.value;
+                  if (aValue === null) {
+                    // rejected by virtualizeAttributeValue
+ // TODO(kpreid): Shouldn't these be hidden in the attributes
+                    // list?
+                    continue;
+                  }
+                  // TODO(kpreid): check escapeAttrib conformance
+ sl.push(' ', attr.name, '="', html.escapeAttrib(aValue), '"');
+                }
+                sl.push('>');
+                switch (tagName) {
+                  case 'area':
+                  case 'base':
+                  case 'basefont':
+                  case 'bgsound':
+                  case 'br':
+                  case 'col':
+                  case 'command':
+                  case 'embed':
+                  case 'frame':
+                  case 'hr':
+                  case 'img':
+                  case 'input':
+                  case 'keygen':
+                  case 'link':
+                  case 'meta':
+                  case 'param':
+                  case 'source':
+                  case 'track':
+                  case 'wbr':
+                    // do nothing
+                    break;
+                  case 'pre':
+                  case 'textarea':
+                  case 'listing':
+                    if (tameCurrent.firstChild &&
+                        tameCurrent.firstChild.nodeType === 3 &&
+                        tameCurrent.firstChild.data[0] === '\n') {
+                      sl.push('\n');
+                    }
+                    // fallthrough
+                  default:
+                    recur(tameCurrent);
+                    sl.push('</', tagName, '>');
+                }
+                break;
+              case 3:  // Text
+                switch (tameCurrent.parentNode.tagName.toLowerCase()) {
+                    // TODO(kpreid): namespace
+                  case 'style':
+                  case 'script':
+                  case 'xmp':
+                  case 'iframe':
+                  case 'noembed':
+                  case 'noframes':
+                  case 'plaintext':
+                  case 'noscript':
+                    sl.push(tameCurrent.data);
+                    break;
+                  default:
+                    sl.push(html.escapeAttrib(tameCurrent.data));
+                    break;
+                }
+                break;
+              case 8:  // Comment
+                sl.push('<', '!--', tameCurrent.data, '-->');
+                break;
+              case 7:  // ProcessingInstruction
+ sl.push('<?', tameCurrent.target, ' ', tameCurrent.data, '>');
+                break;
+              case 10:  // DocumentType
+                sl.push('<', '!DOCTYPE ', tameCurrent.name, '>');
+                break;
+              default:
+                if (typeof console !== 'undefined') {
+ console.error('Domado internal: HTML fragment serialization '
+                      + 'algorithm met unexpected node type '
+                      + tameCurrent.nodeType);
+                }
+                break;
+            }
+          }
+        }
+        recur(tameRoot);
+
+        return sl.join('');
+      }

       // Property descriptors which are independent of any feral object.
       /**
@@ -3646,28 +3736,7 @@
         innerHTML: {
           enumerable: true,
           get: nodeMethod(function () {
-            var node = np(this).feral;
-            var tagName = node.tagName.toLowerCase();
-            var schemaElem = htmlSchema.element(tagName);
-            if (!schemaElem.allowed) {
-              return '';  // unknown node
-            }
-            var innerHtml = node.innerHTML;
-            if (schemaElem.contentIsCDATA) {
-              innerHtml = html.escapeAttrib(innerHtml);
-            } else if (schemaElem.contentIsRCDATA) {
-              // Make sure we return PCDATA.
- // For RCDATA we only need to escape & if they're not part of an
-              // entity.
-              innerHtml = html.normalizeRCData(innerHtml);
-            } else {
- // If we blessed the resulting HTML, then this would round trip - // better but it would still not survive appending, and it would - // propagate event handlers where the setter of innerHTML does not
-              // expect it to.
-              innerHtml = tameInnerHtml(innerHtml);
-            }
-            return innerHtml;
+            return htmlFragmentSerialization(this);
           }),
           set: nodeMethod(function (htmlFragment) {
             // This operation changes the child node list (but not other
=======================================
--- /trunk/tests/com/google/caja/plugin/JQueryTest.java Fri Nov 2 15:14:28 2012 +++ /trunk/tests/com/google/caja/plugin/JQueryTest.java Thu Nov 8 14:46:38 2012
@@ -106,7 +106,7 @@
   }

   public final void testManipulation() throws Exception {
-    runQUnitTestCase("manipulation", 484);
+    runQUnitTestCase("manipulation", 474);
     // Current modifications made to test suite:
// * Removed SES-incompatible Array.prototype modification; was only for
     //     testing jQuery robustness.
@@ -120,6 +120,7 @@
     //   * We don't implement some case of dynamic <script> creation that
// "html() - execute scripts..." and "html() - script exceptions..."
     //     are using.
+    //   * "window.eval is undefined" in appendTo test -- REGRESSION
   }

   public final void testCSS() throws Exception {
=======================================
--- /trunk/tests/com/google/caja/plugin/browser-test-case.js Thu Nov 8 14:37:26 2012 +++ /trunk/tests/com/google/caja/plugin/browser-test-case.js Thu Nov 8 14:46:38 2012
@@ -425,6 +425,11 @@
     getAttribute: function (tameNode, name) {
       return frame.domicile.feralNode(tameNode).getAttribute(name);
     },
+    getParentNode: function(tameNode) {
+      // escapes foreign node/outside-of-vdoc protection
+      return frame.domicile.tameNode(
+          frame.domicile.feralNode(tameNode).parentNode);
+    },
     getBodyNode: function () {
       return frame.domicile.tameNode(frame.innerContainer);
     },
@@ -461,6 +466,7 @@
   makeCallable(directAccess.emitCssHook);
   makeCallable(directAccess.getInnerHTML);
   makeCallable(directAccess.getAttribute);
+  makeCallable(directAccess.getParentNode);
   makeCallable(directAccess.getBodyNode);
   makeCallable(directAccess.getComputedStyle);
   makeCallable(directAccess.makeUnattachedScriptNode);
=======================================
--- /trunk/tests/com/google/caja/plugin/es53-test-domado-dom-guest.html Thu Nov 8 14:37:26 2012 +++ /trunk/tests/com/google/caja/plugin/es53-test-domado-dom-guest.html Thu Nov 8 14:46:38 2012
@@ -1199,6 +1199,49 @@
   });
 </script>

+<form>
+  <p id="testGetInnerHTML" class="testcontainer">
+    <a href="foo">bar</a>
+    <span>Hello world</span>
+  </p>
+</form>
+<!-- Test the operation of our implementation of the innerHTML getter
+      (aka HTML fragment serializer) -->
+<script type="text/javascript">
+  jsunitRegister('testGetInnerHTML',
+                 function testGetInnerHTML() {
+ // Note: The rewritten URI appearing here is not a desirable thing, but we
+    // simply don't support preserving original URIs in general.
+    // What is being tested for here is:
+    //    * general correctness of innerHTML output
+    //    * that target='_blank' does not appear
+ // TODO(kpreid): said funny whitespace probably ought to be canceled out
+    // by some other whitespace processing we're not implementing.
+    assertEquals(
+        '\n' +
+ ' <a href="http://example.com/?effect=NEW_DOCUMENT&amp;loader=' + + 'UNSANDBOXED&amp;uri=http%3A%2F%2Flocalhost%3A8000%2Fant-testlib%2F' +
+        'com%2Fgoogle%2Fcaja%2Fplugin%2Ffoo">bar</a>\n' +
+        '    <span>Hello world</span>\n' +
+        '  ',
+        document.getElementById('testGetInnerHTML').innerHTML);
+
+    // textarea with specific children, per HTML5
+    var container = document.createElement('form');
+    container.appendChild(document.createElement('textarea'));
+    // none
+    assertEquals("<textarea></textarea>", container.innerHTML);
+    // arbitrary text
+    container.firstChild.textContent = 'x';
+    assertEquals('<textarea>x</textarea>', container.innerHTML);
+    // arbitrary text starting w/ newline
+    container.firstChild.textContent = '\nx';
+    assertEquals('<textarea>\n\nx</textarea>', container.innerHTML);
+
+    pass('testGetInnerHTML');
+  });
+</script>
+
 <p id="testInnerHTMLStyleSanitizer" class="testcontainer"></p>
 <script type="text/javascript">
   jsunitRegister('testInnerHTMLStyleSanitizer',
=======================================
--- /trunk/tests/com/google/caja/plugin/es53-test-domado-foreign-guest.html Thu Nov 8 14:37:26 2012 +++ /trunk/tests/com/google/caja/plugin/es53-test-domado-foreign-guest.html Thu Nov 8 14:46:38 2012
@@ -60,7 +60,7 @@
     }
   }

-  function testForeign(name, node) {
+  function testForeign(name, node, expectParent) {
     name = name + ': ';

     // Walk all of DOM interface Node to check for proper encapsulation
@@ -131,6 +131,16 @@
     assertBottom(name + 'lastChild', node.lastChild);
     assertBottom(name + 'previousSibling', node.previousSibling);
     assertBottom(name + 'nextSibling', node.nextSibling);
+
+    assertBottom(name + 'innerHTML', node.innerHTML);
+    assertBottom(name + 'textContent', node.textContent);
+    if (expectParent) {
+      var parent = directAccess.getParentNode(node);
+      assertTrue(name + 'innerHTML sanity check',
+          directAccess.getInnerHTML(node).indexOf('Div number') > 0);
+      assertFalse(name + 'containing innerHTML',
+          parent.innerHTML.indexOf('Div number') > 0);
+    }

     assertTrue(name + 'getElementsByTagName empty',
         node.getElementsByTagName('div').length === 0);
@@ -155,7 +165,7 @@
 <script type="text/javascript">
   jsunitRegister('testExternalForeign',
                  function testExternalForeign() {
-    testForeign('testExternalForeign', getExternalForeignNode());
+    testForeign('testExternalForeign', getExternalForeignNode(), false);
     pass('testExternalForeign');
   });
 </script>
@@ -183,7 +193,7 @@

   jsunitRegister('testEmbeddedForeign',
                  function testEmbeddedForeign() {
-    testForeign('embedded-foreign', getEmbeddedForeignNode());
+    testForeign('testEmbeddedForeign', getEmbeddedForeignNode(), true);

     // Test that getElementsBy* skip foreign children.
     // goodNode1 is at the beginning of the list in document order and

Reply via email to