https://github.com/python/cpython/commit/aa0c3d1098e7fdcc74b753aadf18dd07ddbc76b0
commit: aa0c3d1098e7fdcc74b753aadf18dd07ddbc76b0
branch: 3.13
author: Miss Islington (bot) <31488909+miss-isling...@users.noreply.github.com>
committer: serhiy-storchaka <storch...@gmail.com>
date: 2025-05-10T14:55:12Z
summary:
[3.13] gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295) (GH-133834)

(cherry picked from commit 76c0b01bc401c3e976011bbc69cec56dbebe0ad5)

Co-authored-by: Ezio Melotti <ezio.melo...@gmail.com>
Co-authored-by: Serhiy Storchaka <storch...@gmail.com>

files:
A Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 0a1dd3b7d3bfd2..1b8b6ea0e5ab7a 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -278,7 +278,7 @@ def parse_html_declaration(self, i):
         if rawdata[i:i+4] == '<!--':
             # this case is actually already handled in goahead()
             return self.parse_comment(i)
-        elif rawdata[i:i+3] == '<![':
+        elif rawdata[i:i+9] == '<![CDATA[':
             return self.parse_marked_section(i)
         elif rawdata[i:i+9].lower() == '<!doctype':
             # find the closing >
@@ -295,7 +295,7 @@ def parse_html_declaration(self, i):
     def parse_bogus_comment(self, i, report=1):
         rawdata = self.rawdata
         assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
-                                                'parse_comment()')
+                                                'parse_bogus_comment()')
         pos = rawdata.find('>', i+2)
         if pos == -1:
             return -1
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 4fdba06cf4cc92..68649e9d6d5e9c 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -566,12 +566,33 @@ def test_EOF_in_charref(self):
         for html, expected in data:
             self._run_check(html, expected)
 
-    def test_broken_comments(self):
+    def test_EOF_in_comments_or_decls(self):
+        data = [
+            ('<!', [('data', '<!')]),
+            ('<!-', [('data', '<!-')]),
+            ('<!--', [('data', '<!--')]),
+            ('<![', [('data', '<![')]),
+            ('<![CDATA[', [('data', '<![CDATA[')]),
+            ('<![CDATA[x', [('data', '<![CDATA[x')]),
+            ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
+            ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
+        ]
+        for html, expected in data:
+            self._run_check(html, expected)
+    def test_bogus_comments(self):
         html = ('<! not really a comment >'
                 '<! not a comment either -->'
                 '<! -- close enough -->'
                 '<!><!<-- this was an empty comment>'
-                '<!!! another bogus comment !!!>')
+                '<!!! another bogus comment !!!>'
+                # see #32876
+                '<![with square brackets]!>'
+                '<![\nmultiline\nbogusness\n]!>'
+                '<![more brackets]-[and a hyphen]!>'
+                '<![cdata[should be uppercase]]>'
+                '<![CDATA [whitespaces are not ignored]]>'
+                '<![CDATA]]>'  # required '[' after CDATA
+        )
         expected = [
             ('comment', ' not really a comment '),
             ('comment', ' not a comment either --'),
@@ -579,39 +600,65 @@ def test_broken_comments(self):
             ('comment', ''),
             ('comment', '<-- this was an empty comment'),
             ('comment', '!! another bogus comment !!!'),
+            ('comment', '[with square brackets]!'),
+            ('comment', '[\nmultiline\nbogusness\n]!'),
+            ('comment', '[more brackets]-[and a hyphen]!'),
+            ('comment', '[cdata[should be uppercase]]'),
+            ('comment', '[CDATA [whitespaces are not ignored]]'),
+            ('comment', '[CDATA]]'),
         ]
         self._run_check(html, expected)
 
     def test_broken_condcoms(self):
         # these condcoms are missing the '--' after '<!' and before the '>'
+        # and they are considered bogus comments according to
+        # "8.2.4.42. Markup declaration open state"
         html = ('<![if !(IE)]>broken condcom<![endif]>'
                 '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
                 '<![if !IE 6]><img src="firefox.png" /><![endif]>'
                 '<![if !ie 6]><b>foo</b><![endif]>'
                 '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
-        # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
-        # and "8.2.4.45 Markup declaration open state", comment tokens should
-        # be emitted instead of 'unknown decl', but calling unknown_decl
-        # provides more flexibility.
-        # See also Lib/_markupbase.py:parse_declaration
         expected = [
-            ('unknown decl', 'if !(IE)'),
+            ('comment', '[if !(IE)]'),
             ('data', 'broken condcom'),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if ! IE'),
+            ('comment', '[endif]'),
+            ('comment', '[if ! IE]'),
             ('startendtag', 'link', [('href', 'favicon.tiff')]),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if !IE 6'),
+            ('comment', '[endif]'),
+            ('comment', '[if !IE 6]'),
             ('startendtag', 'img', [('src', 'firefox.png')]),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if !ie 6'),
+            ('comment', '[endif]'),
+            ('comment', '[if !ie 6]'),
             ('starttag', 'b', []),
             ('data', 'foo'),
             ('endtag', 'b'),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if (!IE)|(lt IE 9)'),
+            ('comment', '[endif]'),
+            ('comment', '[if (!IE)|(lt IE 9)]'),
             ('startendtag', 'img', [('src', 'mammoth.bmp')]),
-            ('unknown decl', 'endif')
+            ('comment', '[endif]')
+        ]
+        self._run_check(html, expected)
+
+    def test_cdata_declarations(self):
+        # More tests should be added. See also "8.2.4.42. Markup
+        # declaration open state", "8.2.4.69. CDATA section state",
+        # and issue 32876
+        html = ('<![CDATA[just some plain text]]>')
+        expected = [('unknown decl', 'CDATA[just some plain text')]
+        self._run_check(html, expected)
+
+    def test_cdata_declarations_multiline(self):
+        html = ('<code><![CDATA['
+                ' if (a < b && a > b) {'
+                ' printf("[<marquee>How?</marquee>]");'
+                ' }'
+                ']]></code>')
+        expected = [
+            ('starttag', 'code', []),
+            ('unknown decl',
+             'CDATA[ if (a < b && a > b) { '
+             'printf("[<marquee>How?</marquee>]"); }'),
+            ('endtag', 'code')
         ]
         self._run_check(html, expected)
 
diff --git a/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
new file mode 100644
index 00000000000000..42107de75c7d29
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
@@ -0,0 +1,2 @@
+Fix handling of invalid markup declarations in
+:class:`html.parser.HTMLParser`.
_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com