Daniel Veillard wrote:
>>A few months ago, I came across a "bug" where whitespace nodes as a
>>direct child of the <body> tag would be removed. The problem is similar
>>in that pure whitespace nodes are forbidden by the strict DTD, but
>>allowed by the transitional DTD.
>>
>>In this case, the applied patch checked the DTD in use with code like
>>
>>dtd = xmlGetIntSubset(ctxt->myDoc);
>>if (dtd != NULL && dtd->ExternalID != NULL) {
>> if (!xmlStrcasecmp(dtd->ExternalID,
>> BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
>> !xmlStrcasecmp(dtd->ExternalID,
>> BAD_CAST "-//W3C//DTD HTML 4//EN"))
>>{
>>(line 2060, HTMLparser.c).
>>
>>This code assumes that HTML 4 and HTML 4.01 are the only strict non-XML
>>DTDs in existence.
>>
>>Something similar might be useful for this issue - the <p> tags are not
>>needed for a Transitional DTD. I'll have a look to see if there's an
>>easy fix at the weekend, if nobody's supplied a patch before that :-)
>
>
> Yes, thanks! That sounds like the right approach to me. I would just
> merge that with a new htmlParserOption HTML_PARSE_STRICT, which could be
> either passed by the user to maintain the current behaviour or activated by
> default when the DOCTYPE is read if it happens to be a Strict HTML one.
>
> make sense ?

Actually, I got around to testing this today, and it looks like it was fixed by the above checkin - version 1.196 of HTMLparser.c. iSteve, can you check the behaviour with the latest release, or from CVS?

Daniel, this behaviour was not the same after the patch - htmlNoContentElements[] was altered, so the <p> tag doesn't get added for either the strict or non-strict DTD. Attached is a patch which restores the original behaviour for strict DTDs. I'll leave it up to your judgement as to whether you want it back. There is no support for HTML_PARSE_STRICT; do you still think it might be needed?

I don't like the cast to xmlChar** in the patch - but I couldn't find a pointer definition for 'elements' which wouldn't give a type mismatch warning...

Gary.
Index: HTMLparser.c
===================================================================
RCS file: /cvs/gnome/libxml2/HTMLparser.c,v
retrieving revision 1.199
diff -c -r1.199 HTMLparser.c
*** HTMLparser.c 10 Dec 2005 11:11:11 -0000 1.199
--- HTMLparser.c 12 Jan 2006 13:36:54 -0000
***************
*** 968,973 ****
--- 968,985 ----
};
/*
+ * The list of HTML elements which are supposed not to have
+ * CDATA content in a Strict DTD and where a p element will
+ * be implied
+ */
+ static const char *htmlNoContentElementsStrict[] = {
+ "html",
+ "head",
+ "body",
+ NULL
+ };
+
+ /*
* The list of HTML attributes which are of content %Script;
* NOTE: when adding ones, check htmlIsScriptAttribute() since
* it assumes the name starts with 'on'
***************
*** 1132,1137 ****
--- 1144,1171 ----
}
/**
+ * htmlHasStrictDtd:
+ * @ctxt: a HTML parser context
+ *
+ * Does this document have a strict DTD?
+ *
+ * Returns 1 if the document has a strict HTML DTD, 0 otherwise.
+ */
+
+ static int htmlHasStrictDtd(htmlParserCtxtPtr ctxt) {
+ xmlDtdPtr dtd;
+
+ dtd = xmlGetIntSubset(ctxt->myDoc);
+ if (dtd != NULL && dtd->ExternalID != NULL) {
+ if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
+ !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
+ return(1);
+ }
+
+ return(0);
+ }
+
+ /**
* htmlAutoCloseOnClose:
* @ctxt: an HTML parser context
* @newtag: The new tag name
***************
*** 1353,1358 ****
--- 1387,1393 ----
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
const xmlChar *tag;
int i;
+ xmlChar **elements;
if (ctxt == NULL)
return(-1);
***************
*** 1367,1374 ****
}
if (!htmlOmittedDefaultValue)
return(0);
! for (i = 0; htmlNoContentElements[i] != NULL; i++) {
! if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
htmlAutoClose(ctxt, BAD_CAST"p");
htmlCheckImplied(ctxt, BAD_CAST"p");
htmlnamePush(ctxt, BAD_CAST"p");
--- 1402,1413 ----
}
if (!htmlOmittedDefaultValue)
return(0);
! if (htmlHasStrictDtd(ctxt))
! elements = (xmlChar **)htmlNoContentElementsStrict;
! else
! elements = (xmlChar **)htmlNoContentElements;
! for (i = 0; elements[i] != NULL; i++) {
! if (xmlStrEqual(tag, BAD_CAST elements[i])) {
htmlAutoClose(ctxt, BAD_CAST"p");
htmlCheckImplied(ctxt, BAD_CAST"p");
htmlnamePush(ctxt, BAD_CAST"p");
***************
*** 2041,2047 ****
unsigned int i;
int j;
xmlNodePtr lastChild;
- xmlDtdPtr dtd;
for (j = 0;j < len;j++)
if (!(IS_BLANK_CH(str[j]))) return(0);
--- 2080,2085 ----
***************
*** 2057,2070 ****
/* Only strip CDATA children of the body tag for strict HTML DTDs */
if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
! dtd = xmlGetIntSubset(ctxt->myDoc);
! if (dtd != NULL && dtd->ExternalID != NULL) {
! if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
! !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
! return(1);
! }
}
-
if (ctxt->node == NULL) return(0);
lastChild = xmlGetLastChild(ctxt->node);
while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
--- 2095,2103 ----
/* Only strip CDATA children of the body tag for strict HTML DTDs */
if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
! if (htmlHasStrictDtd(ctxt))
! return(1);
}
if (ctxt->node == NULL) return(0);
lastChild = xmlGetLastChild(ctxt->node);
while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
signature.asc
Description: OpenPGP digital signature
_______________________________________________ xml mailing list, project page http://xmlsoft.org/ [email protected] http://mail.gnome.org/mailman/listinfo/xml
