On Mon, Jul 26, 2010 at 10:24:29AM +0200, Damian Pietras wrote: > Hi, I use libxml to do HTML processing using htmlParseDocument, then do > some simple transformations (like replacing URIs just to correct relative > paths etc.) and then save the document using xmlSaveDoc(). The output is > an HTML file that is passed to the web browser. > > The problem is that when there is no DOCTYPE declaration in the > input document, libxml2 adds a default one: > > <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" > "http://www.w3.org/TR/REC-html40/loose.dtd"> > > There is a difference in rendering of pages by web browsers that comes > from various quirks modes that are turned on or off based on the DOCTYPE > declaration. To illustrate the difference, there is a test page where you > can see the same HTML/CSS code with various DOCTYPEs prepended: > > http://dbaron.org/mozilla/tests/compat?doctype= > http://dbaron.org/mozilla/tests/compat?doctype=%3C!DOCTYPE+HTML+PUBLIC+%22-%2F%2FW3C%2F%2FDTD+HTML+4.01+Transitional%2F%2FEN%22+%22http%3A%2F%2Fwww.w3.org%2FTR%2Fhtml4%2Floose.dtd%22%3E > http://dbaron.org/mozilla/tests/compat?doctype=%3C!DOCTYPE+HTML+PUBLIC+%22-%2F%2FW3C%2F%2FDTD+HTML+4.01+Transitional%2F%2FEN%22%3E > http://dbaron.org/mozilla/tests/compat?doctype=%3C!DOCTYPE+HTML%3E > > Although in the cases I've seen a web page having no DOCTYPE is > rendered the same as with the DOCTYPE that is prepended by libxml2, I would be > happy if there was a way to not add the default DOCTYPE, or to > know that the original document had no DOCTYPE at all. Is there a > way to do that?
Hum, this is added automatically at the end of htmlParseDocument() if no doctype was found, and until now there has been no option to turn this off. Since this is an arbitrary behaviour from libxml2, and while it can be worked around (by finding and removing said DTD from the resulting tree), I think it's best to provide a new HTML_PARSE_NODEFDTD parsing option for the HTML parser to avoid this. The code is actually fairly simple; I'm attaching the patch I will commit soon. I'm also adding a --nodefdtd option to xmllint, to use with --html in order to activate the flag: paphio:~/XML -> xmllint --html --debug tst.html HTML DOCUMENT URL=tst.html standalone=true DTD(html), PUBLIC -//W3C//DTD HTML 4.0 Transitional//EN, SYSTEM http://www.w3.org/TR/REC-html40/loose.dtd ELEMENT html ELEMENT body TEXT content= paphio:~/XML -> xmllint --html --nodefdtd --debug tst.html HTML DOCUMENT URL=tst.html standalone=true ELEMENT html ELEMENT body TEXT content= paphio:~/XML -> Thanks for raising the issue, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ dan...@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/
diff --git a/HTMLparser.c b/HTMLparser.c index 42dc776..224c65f 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -4670,7 +4670,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) ctxt->sax->endDocument(ctxt->userData); - if (ctxt->myDoc != NULL) { + if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) { dtd = xmlGetIntSubset(ctxt->myDoc); if (dtd == NULL) ctxt->myDoc->intSubset = @@ -6530,6 +6530,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) ctxt->options |= XML_PARSE_HUGE; options -= XML_PARSE_HUGE; } + if (options & HTML_PARSE_NODEFDTD) { + ctxt->options |= HTML_PARSE_NODEFDTD; + options -= HTML_PARSE_NODEFDTD; + } ctxt->dictNames = 0; return (options); } diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index cde0ac6..fbcc811 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -177,6 +177,7 @@ XMLPUBFUN void XMLCALL */ typedef enum { HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ + HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ diff --git a/xmllint.c b/xmllint.c index 2a75e3b..88c4a6b 100644 --- a/xmllint.c +++ b/xmllint.c @@ -162,6 +162,9 @@ static int html = 0; static int xmlout = 0; #endif static int htmlout = 0; +#if defined(LIBXML_HTML_ENABLED) +static int nodefdtd = 0; +#endif #ifdef LIBXML_PUSH_ENABLED static int push = 0; #endif /* LIBXML_PUSH_ENABLED */ @@ -2995,6 +2998,7 @@ static void usage(const char *name) { #ifdef LIBXML_HTML_ENABLED printf("\t--html : use the HTML parser\n"); printf("\t--xmlout : force to use the XML serializer when using --html\n"); + printf("\t--nodefdtd : do not default HTML doctype\n"); #endif #ifdef LIBXML_PUSH_ENABLED printf("\t--push : use the push mode of the parser\n"); @@ -3157,6 +3161,10 @@ 
main(int argc, char **argv) { else if ((!strcmp(argv[i], "-xmlout")) || (!strcmp(argv[i], "--xmlout"))) { xmlout++; + } else if ((!strcmp(argv[i], "-nodefdtd")) || + (!strcmp(argv[i], "--nodefdtd"))) { + nodefdtd++; + options |= HTML_PARSE_NODEFDTD; } #endif /* LIBXML_HTML_ENABLED */ else if ((!strcmp(argv[i], "-loaddtd")) ||
_______________________________________________ xml mailing list, project page http://xmlsoft.org/ xml@gnome.org http://mail.gnome.org/mailman/listinfo/xml