Revision: 4223
Author: [email protected]
Date: Fri Aug 6 13:18:27 2010
Log: Be more tolerant when parsing doctypes. This is Gagan.Goku's doctype
patch from http://codereview.appspot.com/1850049/show reviewed by me.
http://code.google.com/p/google-caja/source/detail?r=4223
Modified:
/trunk/src/com/google/caja/parser/html/DoctypeMaker.java
/trunk/src/com/google/caja/parser/html/DomParser.java
/trunk/tests/com/google/caja/parser/html/DoctypeMakerTest.java
/trunk/tests/com/google/caja/parser/html/DomParserTest.java
=======================================
--- /trunk/src/com/google/caja/parser/html/DoctypeMaker.java Fri Nov 20 16:58:55 2009
+++ /trunk/src/com/google/caja/parser/html/DoctypeMaker.java Fri Aug 6 13:18:27 2010
@@ -61,6 +61,7 @@
static {
// S ::= (#x20 | #x9 | #xD | #xA)+
String s = "[ \\t\\r\\n]+";
+ String sStar = "[ \\t\\r\\n]*";
// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6]
// | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
// | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
@@ -89,8 +90,9 @@
String externalId = (
"(?:SYSTEM" + s + "(" + systemLiteral + ")"
+ "|PUBLIC" + s + "("+ pubidLiteral + ")"
- // XML does not allow the system id to be omitted, but HTML does
- + "(?:" + s + "(" + systemLiteral + "))?)");
+ // XML does not allow the system id to be omitted, but HTML does.
+ // Also, whitespaces between public id and system id can be omitted.
+ + "(?:" + sStar + "(" + systemLiteral + "))?)");
String intSubset = "[^\\]>]*";
// '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
// Groups: Name 1, SystemLiteral 2 or 4, PubidLiteral 3.
=======================================
--- /trunk/src/com/google/caja/parser/html/DomParser.java Mon Jul 12 17:01:54 2010
+++ /trunk/src/com/google/caja/parser/html/DomParser.java Fri Aug 6 13:18:27 2010
@@ -23,8 +23,8 @@
import com.google.caja.lexer.ParseException;
import com.google.caja.lexer.Token;
import com.google.caja.lexer.TokenQueue;
-import com.google.caja.lexer.TokenStream;
import com.google.caja.lexer.TokenQueue.Mark;
+import com.google.caja.lexer.TokenStream;
import com.google.caja.reporting.Message;
import com.google.caja.reporting.MessagePart;
import com.google.caja.reporting.MessageQueue;
@@ -159,7 +159,7 @@
domImpl = DOMImplementationRegistry.newInstance()
.getDOMImplementation(features);
} catch (ClassNotFoundException ex) {
- throw new SomethingWidgyHappenedError(
+ throw new SomethingWidgyHappenedError(
"Missing DOM implementation. Is Xerces on the classpath?",
ex);
} catch (IllegalAccessException ex) {
throw new SomethingWidgyHappenedError(
@@ -178,7 +178,6 @@
DocumentType doctype = doctypeMaker != null
? doctypeMaker.apply(domImpl) : null;
return domImpl.createDocument(null, null, doctype);
-
}
public static Document makeDocument(
@@ -657,6 +656,7 @@
if (tokens.isEmpty()) { return null; }
Function<DOMImplementation, DocumentType> doctypeMaker = null;
Mark start = tokens.mark();
+
doctypeloop:
while (!tokens.isEmpty()) {
Token<HtmlTokenType> t = tokens.peek();
@@ -665,6 +665,7 @@
case IGNORABLE:
tokens.pop();
break;
+
case DIRECTIVE:
tokens.pop();
final Function<DOMImplementation, DocumentType> maker
@@ -681,7 +682,19 @@
break doctypeloop;
}
break;
- default: break doctypeloop;
+
+ case TEXT:
+ String text = t.text;
+ if (text.trim().equals("")) {
+ // Ignore beginning whitespace.
+ tokens.pop();
+ break;
+ }
+ // Otherwise no doctype.
+ break doctypeloop;
+
+ default:
+ break doctypeloop;
}
}
tokens.rewind(start);
=======================================
--- /trunk/tests/com/google/caja/parser/html/DoctypeMakerTest.java Fri Nov 20 16:58:55 2009
+++ /trunk/tests/com/google/caja/parser/html/DoctypeMakerTest.java Fri Aug 6 13:18:27 2010
@@ -162,6 +162,20 @@
assertDoctype("html", null, null, "<!DOCTYPE HTML>");
assertDoctype("html", null, null, "<!DOCTYPE html>");
+
+ // Test whether omitting whitespace b/w public id and system id allows
+ // doctype to be parsed correctly.
+ assertDoctype("html", "-//W3C//DTD HTML 4.01 Transitional//EN",
+ "http://www.w3.org/TR/html4/loose.dtd",
+ "<!DOCTYPE HTML PUBLIC "
+ + "\"-//W3C//DTD HTML 4.01 Transitional//EN\""
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">");
+
+ assertDoctype("html", "-//W3C//DTD XHTML 1.0 Transitional//EN",
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
+ "<!DOCTYPE html PUBLIC "
+ + "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">");
}
private void assertDoctype(
=======================================
--- /trunk/tests/com/google/caja/parser/html/DomParserTest.java Thu Jul 22 10:23:41 2010
+++ /trunk/tests/com/google/caja/parser/html/DomParserTest.java Fri Aug 6 13:18:27 2010
@@ -42,6 +42,7 @@
import java.util.NoSuchElementException;
import org.w3c.dom.Attr;
+import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
@@ -1991,6 +1992,38 @@
// the behavior of fragments around DOCTYPEs.
true);
}
+
+ public final void testFindDoctypeIgnoresLeadingWhitespace() throws Exception {
+ String[] htmlInput = {
+ " \t\r\n <!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 1.0 Transitional//EN\"",
+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">",
+ "<xmp><br/></xmp>" };
+ TokenQueue<HtmlTokenType> tq = tokenizeTestInput(
+ Join.join("\n", htmlInput), false, true);
+ DomParser parser = new DomParser(tq, false, mq);
+
+ Document doc = parser.parseDocument().getOwnerDocument();
+ assertEquals("-//W3C//DTD HTML 1.0 Transitional//EN",
+ doc.getDoctype().getPublicId());
+ assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
+ doc.getDoctype().getSystemId());
+ assertEquals("html",
+ doc.getDoctype().getName());
+
+ assertParsedMarkup(Arrays.asList(htmlInput),
+ Arrays.asList(
+ "Element : html 4+1-4+17",
+ " Element : head 4+1-4+1",
+ " Element : body 4+1-4+17",
+ " Element : xmp 4+1-4+17",
+ " Text : <br/> 4+6-4+11"
+ ),
+ Arrays.<String>asList(),
+ Arrays.asList(
+ "<html><head></head><body><xmp><br/></xmp></body></html>"
+ ),
+ false, false);
+ }
public final void testDoctypeGuessAsXhtml() throws Exception {
assertParsedMarkup(