https://github.com/python/cpython/commit/8ac7613dc8b8f82253d7c0e2b6ef6ed703a0a1ee
commit: 8ac7613dc8b8f82253d7c0e2b6ef6ed703a0a1ee
branch: main
author: Serhiy Storchaka <storch...@gmail.com>
committer: serhiy-storchaka <storch...@gmail.com>
date: 2025-07-04T07:00:23Z
summary:

gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard 
(GH-135664)

* "--!>" now ends the comment.
* "-- >" no longer ends the comment.
* Support abnormally ended empty comments "<-->" and "<--->".

---------

Co-author: Kerim Kabirov <the.privat33r...@pm.me>
Co-authored-by: Ezio Melotti <ezio.melo...@gmail.com>

files:
A Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index cc15de07b5bae6..9b4f09599134bd 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -29,7 +29,8 @@
 starttagopen = re.compile('<[a-zA-Z]')
 endtagopen = re.compile('</[a-zA-Z]')
 piclose = re.compile('>')
-commentclose = re.compile(r'--\s*>')
+commentclose = re.compile(r'--!?>')
+commentabruptclose = re.compile(r'-?>')
 # Note:
 #  1) if you change tagfind/attrfind remember to update locatetagend too;
 #  2) if you change tagfind/attrfind and/or locatetagend the parser will
@@ -336,6 +337,21 @@ def parse_html_declaration(self, i):
         else:
             return self.parse_bogus_comment(i)
 
+    # Internal -- parse comment, return length or -1 if not terminated
+    # see 
https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
+    def parse_comment(self, i, report=True):
+        rawdata = self.rawdata
+        assert rawdata.startswith('<!--', i), 'unexpected call to 
parse_comment()'
+        match = commentclose.search(rawdata, i+4)
+        if not match:
+            match = commentabruptclose.match(rawdata, i+4)
+            if not match:
+                return -1
+        if report:
+            j = match.start()
+            self.handle_comment(rawdata[i+4: j])
+        return match.end()
+
     # Internal -- parse bogus comment, return length or -1 if not terminated
     # see 
https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
     def parse_bogus_comment(self, i, report=1):
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index d0d2c54217ccaf..15cad061889a79 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -367,17 +367,45 @@ def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'
                 '<!------>'
+                '<!----->'
                 '<!---->'
+                # abrupt-closing-of-empty-comment
+                '<!--->'
+                '<!-->'
                 '<!----I have many hyphens---->'
                 '<!-- I have a > in the middle -->'
-                '<!-- and I have -- in the middle! -->')
+                '<!-- and I have -- in the middle! -->'
+                '<!--incorrectly-closed-comment--!>'
+                '<!----!>'
+                '<!----!-->'
+                '<!---- >-->'
+                '<!---!>-->'
+                '<!--!>-->'
+                # nested-comment
+                '<!-- <!-- nested --> -->'
+                '<!--<!-->'
+                '<!--<!--!>'
+        )
         expected = [('comment', " I'm a valid comment "),
                     ('comment', 'me too!'),
                     ('comment', '--'),
+                    ('comment', '-'),
+                    ('comment', ''),
+                    ('comment', ''),
                     ('comment', ''),
                     ('comment', '--I have many hyphens--'),
                     ('comment', ' I have a > in the middle '),
-                    ('comment', ' and I have -- in the middle! ')]
+                    ('comment', ' and I have -- in the middle! '),
+                    ('comment', 'incorrectly-closed-comment'),
+                    ('comment', ''),
+                    ('comment', '--!'),
+                    ('comment', '-- >'),
+                    ('comment', '-!>'),
+                    ('comment', '!>'),
+                    ('comment', ' <!-- nested '), ('data', ' -->'),
+                    ('comment', '<!'),
+                    ('comment', '<!'),
+        ]
         self._run_check(html, expected)
 
     def test_condcoms(self):
diff --git 
a/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst 
b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst
new file mode 100644
index 00000000000000..71d15ee0852ebd
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst
@@ -0,0 +1,3 @@
+Fix comment parsing in :class:`html.parser.HTMLParser` according to the
+HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the
+comment. Support abnormally ended empty comments ``<-->`` and ``<--->``.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: arch...@mail-archive.com

Reply via email to