Revision: 4223
Author: [email protected]
Date: Fri Aug  6 13:18:27 2010
Log: Be more tolerant when parsing doctypes. This is Gagan.Goku's doctype patch from http://codereview.appspot.com/1850049/show reviewed by me.
http://code.google.com/p/google-caja/source/detail?r=4223

Modified:
 /trunk/src/com/google/caja/parser/html/DoctypeMaker.java
 /trunk/src/com/google/caja/parser/html/DomParser.java
 /trunk/tests/com/google/caja/parser/html/DoctypeMakerTest.java
 /trunk/tests/com/google/caja/parser/html/DomParserTest.java

=======================================
--- /trunk/src/com/google/caja/parser/html/DoctypeMaker.java Fri Nov 20 16:58:55 2009
+++ /trunk/src/com/google/caja/parser/html/DoctypeMaker.java Fri Aug 6 13:18:27 2010
@@ -61,6 +61,7 @@
   static {
     // S             ::=  (#x20 | #x9 | #xD | #xA)+
     String s = "[ \\t\\r\\n]+";
+    String sStar = "[ \\t\\r\\n]*";
     // NameStartChar ::=  ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6]
     //                 |  [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
     //                 |  [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
@@ -89,8 +90,9 @@
     String externalId = (
         "(?:SYSTEM" + s + "(" + systemLiteral + ")"
         + "|PUBLIC" + s + "("+ pubidLiteral + ")"
-        // XML does not allow the system id to be omitted, but HTML does
-        + "(?:" + s + "(" + systemLiteral + "))?)");
+        // XML does not allow the system id to be omitted, but HTML does.
+        // Also, whitespaces between public id and system id can be omitted.
+        + "(?:" + sStar + "(" + systemLiteral + "))?)");
     String intSubset = "[^\\]>]*";
     // '<!DOCTYPE' S  Name (S  ExternalID)? S? ('[' intSubset ']' S?)? '>'
     // Groups: Name 1, SystemLiteral 2 or 4, PubidLiteral 3.
=======================================
--- /trunk/src/com/google/caja/parser/html/DomParser.java Mon Jul 12 17:01:54 2010
+++ /trunk/src/com/google/caja/parser/html/DomParser.java Fri Aug 6 13:18:27 2010
@@ -23,8 +23,8 @@
 import com.google.caja.lexer.ParseException;
 import com.google.caja.lexer.Token;
 import com.google.caja.lexer.TokenQueue;
-import com.google.caja.lexer.TokenStream;
 import com.google.caja.lexer.TokenQueue.Mark;
+import com.google.caja.lexer.TokenStream;
 import com.google.caja.reporting.Message;
 import com.google.caja.reporting.MessagePart;
 import com.google.caja.reporting.MessageQueue;
@@ -159,7 +159,7 @@
         domImpl = DOMImplementationRegistry.newInstance()
             .getDOMImplementation(features);
       } catch (ClassNotFoundException ex) {
-         throw new SomethingWidgyHappenedError(
+        throw new SomethingWidgyHappenedError(
             "Missing DOM implementation. Is Xerces on the classpath?", ex);
       } catch (IllegalAccessException ex) {
         throw new SomethingWidgyHappenedError(
@@ -178,7 +178,6 @@
     DocumentType doctype = doctypeMaker != null
         ? doctypeMaker.apply(domImpl) : null;
     return domImpl.createDocument(null, null, doctype);
-
   }

   public static Document makeDocument(
@@ -657,6 +656,7 @@
     if (tokens.isEmpty()) { return null; }
     Function<DOMImplementation, DocumentType> doctypeMaker = null;
     Mark start = tokens.mark();
+
     doctypeloop:
     while (!tokens.isEmpty()) {
       Token<HtmlTokenType> t = tokens.peek();
@@ -665,6 +665,7 @@
         case IGNORABLE:
           tokens.pop();
           break;
+
         case DIRECTIVE:
           tokens.pop();
           final Function<DOMImplementation, DocumentType> maker
@@ -681,7 +682,19 @@
             break doctypeloop;
           }
           break;
-        default: break doctypeloop;
+
+        case TEXT:
+          String text = t.text;
+          if (text.trim().equals("")) {
+            // Ignore beginning whitespace.
+            tokens.pop();
+            break;
+          }
+          // Otherwise no doctype.
+          break doctypeloop;
+
+        default:
+          break doctypeloop;
       }
     }
     tokens.rewind(start);
=======================================
--- /trunk/tests/com/google/caja/parser/html/DoctypeMakerTest.java Fri Nov 20 16:58:55 2009
+++ /trunk/tests/com/google/caja/parser/html/DoctypeMakerTest.java Fri Aug 6 13:18:27 2010
@@ -162,6 +162,20 @@
     assertDoctype("html", null, null, "<!DOCTYPE HTML>");

     assertDoctype("html", null, null, "<!DOCTYPE html>");
+
+    // Test whether omitting whitespace b/w public id and system id allows
+    // doctype to be parsed correctly.
+    assertDoctype("html", "-//W3C//DTD HTML 4.01 Transitional//EN",
+                  "http://www.w3.org/TR/html4/loose.dtd",
+                  "<!DOCTYPE HTML PUBLIC "
+                  + "\"-//W3C//DTD HTML 4.01 Transitional//EN\""
+                  + "\"http://www.w3.org/TR/html4/loose.dtd\">");
+
+    assertDoctype("html", "-//W3C//DTD XHTML 1.0 Transitional//EN",
+                  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
+                  "<!DOCTYPE html PUBLIC "
+                  + "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
+                  + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">");
 }

   private void assertDoctype(
=======================================
--- /trunk/tests/com/google/caja/parser/html/DomParserTest.java Thu Jul 22 10:23:41 2010
+++ /trunk/tests/com/google/caja/parser/html/DomParserTest.java Fri Aug 6 13:18:27 2010
@@ -42,6 +42,7 @@
 import java.util.NoSuchElementException;

 import org.w3c.dom.Attr;
+import org.w3c.dom.Document;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -1991,6 +1992,38 @@
         // the behavior of fragments around DOCTYPEs.
         true);
   }
+
+  public final void testFindDoctypeIgnoresLeadingWhitespace() throws Exception {
+    String[] htmlInput = {
+        " \t\r\n <!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 1.0 Transitional//EN\"",
+        "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">",
+        "<xmp><br/></xmp>" };
+    TokenQueue<HtmlTokenType> tq = tokenizeTestInput(
+        Join.join("\n", htmlInput), false, true);
+    DomParser parser = new DomParser(tq, false, mq);
+
+    Document doc = parser.parseDocument().getOwnerDocument();
+    assertEquals("-//W3C//DTD HTML 1.0 Transitional//EN",
+                 doc.getDoctype().getPublicId());
+    assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
+                 doc.getDoctype().getSystemId());
+    assertEquals("html",
+                 doc.getDoctype().getName());
+
+    assertParsedMarkup(Arrays.asList(htmlInput),
+        Arrays.asList(
+            "Element : html 4+1-4+17",
+            "  Element : head 4+1-4+1",
+            "  Element : body 4+1-4+17",
+            "    Element : xmp 4+1-4+17",
+            "      Text : <br/> 4+6-4+11"
+            ),
+        Arrays.<String>asList(),
+        Arrays.asList(
+            "<html><head></head><body><xmp><br/></xmp></body></html>"
+            ),
+        false, false);
+  }

   public final void testDoctypeGuessAsXhtml() throws Exception {
     assertParsedMarkup(

Reply via email to