Re: [xml] [PATCH] less-than character and HTML parser module

Christian Schoenebeck Sat, 25 Apr 2015 19:23:00 -0700

On Thursday 16 April 2015 13:59:28 Christian Schoenebeck wrote:
> On Thursday 16 April 2015 10:32:32 you wrote:
> > > There you go; you find the updated patch attached. It now requires
> > > HTML_PARSE_RECOVER option to be set for recovering from stand-alone
> > > less-than characters.
> > 
> > That sounds fine *except* it doesn't raise an error.
> > The parser knows it's a broken construct that must be pointed out.
> 
> Ok, I see what I can do about that. ;)


Even two patches this time. The 1st patch (libxml-less-than-char_v3.patch) 
just addresses that last minor issue you came up with regarding missing error 
messages. However you probably might skip that patch and rather look at the 
second patch instead.

> >  It sounds a bit weird to handle that error case as one of the main
> >  content
> > 
> > cases, I would still be tempted to go into htmlParseStartTag, get the
> > error reported, but push corrective data instead in recover mode.
> 
> My initial thought solution was to enter htmlParseElement() like before,
> and in case htmlParseElement() encounters an error, it would handle the
> chunk as text instead (if recover option is on). That would probably come
> to the closest what most browsers seem to do. But the problem: that would

The 2nd patch (libxml-invalid-tag-as-text.patch) uses that more general way to 
resolve this overall issue. That is, instead of looking at the content and 
trying to guess ahead whether a less than character will yield in a valid tag, 
this 2nd patch rather uses the regular element parse code, and if it fails to 
parse the tag start it returns a special return value which will cause the 
next input to be consumed as text instead. Most notably this solution has the 
advantage, that many more misfit cases will be consumed as text instead (if 
recovery option is on). For example this 2nd patch also allows to consume 
this:

        a << b

The 1st patch would still have failed in this case.

Please review this 2nd patch carefully though. Because that patch is rewinding 
the parser input, and since I am not very familiar with the libxm2 internals, 
I am not sure whether my rewinding code is a) safe and b) if it does actually 
work with all kinds of input stream types supported by the libxml2 API.

Best regards,
Christian Schoenebeck

diff -r -u libxml2-2.9.1+dfsg1.orig/HTMLparser.c libxml2-2.9.1+dfsg1.SIMPLE_PATCH/HTMLparser.c
--- libxml2-2.9.1+dfsg1.orig/HTMLparser.c	2015-04-14 13:05:01.000000000 +0200
+++ libxml2-2.9.1+dfsg1.SIMPLE_PATCH/HTMLparser.c	2015-04-25 14:29:13.472858931 +0200
@@ -2948,8 +2948,10 @@
 
 
 /**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
  * @ctxt:  an HTML parser context
+ * @prep:  optional character to be prepended to text, 0 if no character
+ *         shall be prepended
  *
  * parse a CharData section.
  * if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2960,15 @@
  */
 
 static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
-    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, char prep) {
+    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
     int nbchar = 0;
     int cur, l;
     int chunk = 0;
 
+    if (prep)
+	buf[nbchar++] = prep;
+
     SHRINK;
     cur = CUR_CHAR(l);
     while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3048,21 @@
 }
 
 /**
+ * htmlParseCharData:
+ * @ctxt:  an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+    htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
  * htmlParseExternalID:
  * @ctxt:  an HTML parser context
  * @publicID:  a xmlChar** receiving PubidLiteral
@@ -4157,14 +4177,27 @@
 	    }
 
 	    /*
-	     * Third case :  a sub-element.
+	     * Third case : (unescaped) stand-alone less-than character.
+	     *              Only if HTML_PARSE_RECOVER option is set.
+	     */
+	    else if (ctxt->recovery && (CUR == '<') &&
+		     (IS_BLANK_CH(NXT(1)) || (NXT(1) == '='))) {
+		htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+			 "htmlParseContent: invalid element name\n",
+			 NULL, NULL);
+		NEXT;
+		htmlParseCharDataInternal(ctxt, '<');
+	    }
+
+	    /*
+	     * Fourth case :  a sub-element.
 	     */
 	    else if (CUR == '<') {
 		htmlParseElement(ctxt);
 	    }
 
 	    /*
-	     * Fourth case : a reference. If if has not been resolved,
+	     * Fifth case : a reference. If if has not been resolved,
 	     *    parsing returns it's Name, create the node
 	     */
 	    else if (CUR == '&') {
@@ -4172,7 +4205,7 @@
 	    }
 
 	    /*
-	     * Fifth case : end of the resource
+	     * Sixth case : end of the resource
 	     */
 	    else if (CUR == 0) {
 		htmlAutoCloseOnEnd(ctxt);
@@ -4567,7 +4600,20 @@
 	    }
 
 	    /*
-	     * Third case :  a sub-element.
+	     * Third case : (unescaped) stand-alone less-than character.
+	     *              Only if HTML_PARSE_RECOVER option is set.
+	     */
+	    else if (ctxt->recovery && (CUR == '<') &&
+		     (IS_BLANK_CH(NXT(1)) || (NXT(1) == '='))) {
+		htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+			 "htmlParseContent: invalid element name\n",
+			 NULL, NULL);
+		NEXT;
+		htmlParseCharDataInternal(ctxt, '<');
+	    }
+
+	    /*
+	     * Fourth case :  a sub-element.
 	     */
 	    else if (CUR == '<') {
 		htmlParseElementInternal(ctxt);
@@ -4578,7 +4624,7 @@
 	    }
 
 	    /*
-	     * Fourth case : a reference. If if has not been resolved,
+	     * Fifth case : a reference. If if has not been resolved,
 	     *    parsing returns it's Name, create the node
 	     */
 	    else if (CUR == '&') {
@@ -4586,7 +4632,7 @@
 	    }
 
 	    /*
-	     * Fifth case : end of the resource
+	     * Sixth case : end of the resource
 	     */
 	    else if (CUR == 0) {
 		htmlAutoCloseOnEnd(ctxt);

diff -r -u libxml2-2.9.1+dfsg1.orig/HTMLparser.c libxml2-2.9.1+dfsg1/HTMLparser.c
--- libxml2-2.9.1+dfsg1.orig/HTMLparser.c	2015-04-14 13:05:01.000000000 +0200
+++ libxml2-2.9.1+dfsg1/HTMLparser.c	2015-04-26 03:18:31.088399169 +0200
@@ -59,6 +59,7 @@
 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
 			     xmlChar end, xmlChar  end2, xmlChar end3);
 static void htmlParseComment(htmlParserCtxtPtr ctxt);
+static int htmlParseElementRecursive(htmlParserCtxtPtr ctxt);
 
 /************************************************************************
  *									*
@@ -2946,10 +2947,11 @@
     }
 }
 
-
 /**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
  * @ctxt:  an HTML parser context
+ * @prep:  optional character to be prepended to text, 0 if no character
+ *         shall be prepended
  *
  * parse a CharData section.
  * if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2960,15 @@
  */
 
 static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
-    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, char prep) {
+    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
     int nbchar = 0;
     int cur, l;
     int chunk = 0;
 
+    if (prep)
+	buf[nbchar++] = prep;
+
     SHRINK;
     cur = CUR_CHAR(l);
     while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3048,21 @@
 }
 
 /**
+ * htmlParseCharData:
+ * @ctxt:  an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+    htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
  * htmlParseExternalID:
  * @ctxt:  an HTML parser context
  * @publicID:  a xmlChar** receiving PubidLiteral
@@ -4051,7 +4071,7 @@
  * htmlParseContent:
  * @ctxt:  an HTML parser context
  *
- * Parse a content: comment, sub-element, reference or text.
+ * Parse a content recursively: comment, sub-element, reference or text.
  * Kept for compatibility with old code
  */
 
@@ -4060,6 +4080,7 @@
     xmlChar *currentNode;
     int depth;
     const xmlChar *name;
+    int res;
 
     currentNode = xmlStrdup(ctxt->name);
     depth = ctxt->nameNr;
@@ -4157,10 +4178,19 @@
 	    }
 
 	    /*
-	     * Third case :  a sub-element.
+	     * Third case :  a sub-element (recursively).
 	     */
 	    else if (CUR == '<') {
-		htmlParseElement(ctxt);
+		res = htmlParseElementRecursive(ctxt);
+
+		/*
+		 * If it was an invalid element tag and parser position was
+		 * rewinded to tag start, then consume next input as text.
+		 */
+		if (res == -2) {
+		    NEXT;
+		    htmlParseCharDataInternal(ctxt, '<');
+		}
 	    }
 
 	    /*
@@ -4201,19 +4231,24 @@
 }
 
 /**
- * htmlParseElement:
+ * htmlParseElementRecursive:
  * @ctxt:  an HTML parser context
  *
  * parse an HTML element, this is highly recursive
  * this is kept for compatibility with previous code versions
+ * regarding htmlParseElement()
  *
  * [39] element ::= EmptyElemTag | STag content ETag
  *
  * [41] Attribute ::= Name Eq AttValue
+ * 
+ * Returns 0: success,
+ *        -1: error - consumed tag input (that is dumped invalid tag),
+ *        -2: error - rewinded input to tag start - handle next char as text
  */
 
-void
-htmlParseElement(htmlParserCtxtPtr ctxt) {
+static int
+htmlParseElementRecursive(htmlParserCtxtPtr ctxt) {
     const xmlChar *name;
     xmlChar *currentNode = NULL;
     const htmlElemDesc * info;
@@ -4221,15 +4256,16 @@
     int failed;
     int depth;
     const xmlChar *oldptr;
+    xmlParserInputPtr startPos = NULL;
 
     if ((ctxt == NULL) || (ctxt->input == NULL)) {
 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-		     "htmlParseElement: context error\n", NULL, NULL);
-	return;
+		     "htmlParseElementRecursive: context error\n", NULL, NULL);
+	return -1;
     }
 
     if (ctxt->instate == XML_PARSER_EOF)
-        return;
+        return -1;
 
     /* Capture start position */
     if (ctxt->record_info) {
@@ -4237,13 +4273,18 @@
                           (CUR_PTR - ctxt->input->base);
 	node_info.begin_line = ctxt->input->line;
     }
+    startPos = xmlNewInputStream(ctxt);
+    if (startPos != NULL)
+       *startPos = *ctxt->input;
+    else
+       return -1;
 
     failed = htmlParseStartTag(ctxt);
     name = ctxt->name;
     if ((failed == -1) || (name == NULL)) {
 	if (CUR == '>')
 	    NEXT;
-        return;
+        goto errorDoRewind;
     }
 
     /*
@@ -4263,7 +4304,7 @@
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 	    ctxt->sax->endElement(ctxt->userData, name);
 	htmlnamePop(ctxt);
-	return;
+	goto errorDoConsume;
     }
 
     if (CUR == '>') {
@@ -4290,7 +4331,7 @@
 	   node_info.node = ctxt->node;
 	   xmlParserAddNodeInfo(ctxt, &node_info);
 	}
-	return;
+	goto errorDoConsume;
     }
 
     /*
@@ -4300,7 +4341,7 @@
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 	    ctxt->sax->endElement(ctxt->userData, name);
 	htmlnamePop(ctxt);
-	return;
+	goto errorDoConsume;
     }
 
     /*
@@ -4331,6 +4372,52 @@
 
     if (currentNode != NULL)
 	xmlFree(currentNode);
+
+    /* Success */
+    failed = 0;
+    goto done;
+
+    /* Error occurred, leave input consumed. */
+errorDoConsume:
+    failed = -1;
+    goto done;
+
+    /* 
+     * Error occurred, rewind parser to tag start position.
+     * (but only if HTML_PARSE_RECOVER is set, otherwise leave input consumed)
+     */
+errorDoRewind:
+    if (ctxt->recovery) {
+	failed = -2;
+	if (startPos)
+	    *ctxt->input = *startPos;
+    } else {
+	failed = -1;
+    }
+    goto done;
+
+done:
+    if (startPos != NULL)
+	xmlFree(startPos);
+
+    return failed;
+}
+
+/**
+ * htmlParseElement:
+ * @ctxt:  an HTML parser context
+ *
+ * parse an HTML element, this is highly recursive
+ * this is kept for compatibility with previous code versions
+ *
+ * [39] element ::= EmptyElemTag | STag content ETag
+ *
+ * [41] Attribute ::= Name Eq AttValue
+ */
+
+void
+htmlParseElement(htmlParserCtxtPtr ctxt) {
+    htmlParseElementRecursive(ctxt);
 }
 
 static void
@@ -4352,7 +4439,7 @@
 }
 
 /**
- * htmlParseElementInternal:
+ * htmlParseElementNonRecursive:
  * @ctxt:  an HTML parser context
  *
  * parse an HTML element, new version, non recursive
@@ -4360,23 +4447,28 @@
  * [39] element ::= EmptyElemTag | STag content ETag
  *
  * [41] Attribute ::= Name Eq AttValue
+ * 
+ * Returns 0: success,
+ *        -1: error - consumed tag input (that is dumped invalid tag),
+ *        -2: error - rewinded input to tag start - handle next char as text
  */
 
-static void
-htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
+static int
+htmlParseElementNonRecursive(htmlParserCtxtPtr ctxt) {
     const xmlChar *name;
     const htmlElemDesc * info;
     htmlParserNodeInfo node_info = { 0, };
     int failed;
+    xmlParserInputPtr startPos = NULL;
 
     if ((ctxt == NULL) || (ctxt->input == NULL)) {
 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
-		     "htmlParseElementInternal: context error\n", NULL, NULL);
-	return;
+		     "htmlParseElementNonRecursive: context error\n", NULL, NULL);
+	return -1;
     }
 
     if (ctxt->instate == XML_PARSER_EOF)
-        return;
+        return -1;
 
     /* Capture start position */
     if (ctxt->record_info) {
@@ -4384,13 +4476,18 @@
                           (CUR_PTR - ctxt->input->base);
 	node_info.begin_line = ctxt->input->line;
     }
+    startPos = xmlNewInputStream(ctxt);
+    if (startPos != NULL)
+	*startPos = *ctxt->input;
+    else
+	return -1;
 
     failed = htmlParseStartTag(ctxt);
     name = ctxt->name;
     if ((failed == -1) || (name == NULL)) {
 	if (CUR == '>')
 	    NEXT;
-        return;
+	goto errorDoRewind;
     }
 
     /*
@@ -4410,7 +4507,7 @@
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 	    ctxt->sax->endElement(ctxt->userData, name);
 	htmlnamePop(ctxt);
-	return;
+	goto errorDoConsume;
     }
 
     if (CUR == '>') {
@@ -4430,7 +4527,7 @@
         if (ctxt->record_info)
             htmlNodeInfoPush(ctxt, &node_info);
         htmlParserFinishElementParsing(ctxt);
-	return;
+	goto errorDoConsume;
     }
 
     /*
@@ -4440,26 +4537,56 @@
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 	    ctxt->sax->endElement(ctxt->userData, name);
 	htmlnamePop(ctxt);
-	return;
+	goto errorDoConsume;
     }
 
     if (ctxt->record_info)
         htmlNodeInfoPush(ctxt, &node_info);
+
+    /* Success */
+    failed = 0;
+    goto done;
+
+    /* Error occurred, leave input consumed. */
+errorDoConsume:
+    failed = -1;
+    goto done;
+
+    /* 
+     * Error occurred, rewind parser to tag start position.
+     * (but only if HTML_PARSE_RECOVER is set, otherwise leave input consumed)
+     */
+errorDoRewind:
+    if (ctxt->recovery) {
+	failed = -2;
+	if (startPos)
+	    *ctxt->input = *startPos;
+    } else {
+	failed = -1;
+    }
+    goto done;
+
+done:
+    if (startPos != NULL)
+	xmlFree(startPos);
+
+    return failed;
 }
 
 /**
- * htmlParseContentInternal:
+ * htmlParseContentNonRecursive:
  * @ctxt:  an HTML parser context
  *
  * Parse a content: comment, sub-element, reference or text.
- * New version for non recursive htmlParseElementInternal
+ * New version for non recursive htmlParseElementNonRecursive
  */
 
 static void
-htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
+htmlParseContentNonRecursive(htmlParserCtxtPtr ctxt) {
     xmlChar *currentNode;
     int depth;
     const xmlChar *name;
+    int res;
 
     currentNode = xmlStrdup(ctxt->name);
     depth = ctxt->nameNr;
@@ -4570,11 +4697,20 @@
 	     * Third case :  a sub-element.
 	     */
 	    else if (CUR == '<') {
-		htmlParseElementInternal(ctxt);
+		res = htmlParseElementNonRecursive(ctxt);
 		if (currentNode != NULL) xmlFree(currentNode);
 
 		currentNode = xmlStrdup(ctxt->name);
 		depth = ctxt->nameNr;
+
+		/*
+		 * If it was an invalid element tag and parser position was
+		 * rewinded to tag start, then consume next input as text.
+		 */
+		if (res == -2) {
+		    NEXT;
+		    htmlParseCharDataInternal(ctxt, '<');
+		}
 	    }
 
 	    /*
@@ -4618,14 +4754,14 @@
  * htmlParseContent:
  * @ctxt:  an HTML parser context
  *
- * Parse a content: comment, sub-element, reference or text.
+ * Parse a content non-recursive: comment, sub-element, reference or text.
  * This is the entry point when called from parser.c
  */
 
 void
 __htmlParseContent(void *ctxt) {
     if (ctxt != NULL)
-	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
+	htmlParseContentNonRecursive((htmlParserCtxtPtr) ctxt);
 }
 
 /**
@@ -4732,7 +4868,7 @@
     /*
      * Time to start parsing the tree itself
      */
-    htmlParseContentInternal(ctxt);
+    htmlParseContentNonRecursive(ctxt);
 
     /*
      * autoclose

_______________________________________________
xml mailing list, project page  http://xmlsoft.org/
xml@gnome.org
https://mail.gnome.org/mailman/listinfo/xml

Re: [xml] [PATCH] less-than character and HTML parser module

Reply via email to