https://github.com/python/cpython/commit/dcf24768c918c41821cda6fe6a1aa20ce26545dd
commit: dcf24768c918c41821cda6fe6a1aa20ce26545dd
branch: 3.12
author: Serhiy Storchaka <[email protected]>
committer: Yhg1s <[email protected]>
date: 2025-10-06T16:06:29+02:00
summary:
[3.12] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137774)
"] ]>" and "]] >" no longer end the CDATA section.
Make CDATA section parsing context-dependent.
Add private method HTMLParser._set_support_cdata() to change the context.
If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>".
If called with False, "<![CDATA[" starts a bogus comment which ends with ">".
(cherry picked from commit 0cbbfc462119b9107b373c24d2bda5a1271bed36)
files:
A Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index b8ee81ce80d5b8..9b7556592ba473 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -128,6 +128,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
+ self._support_cdata = True
self._escapable = True
super().reset()
@@ -165,6 +166,19 @@ def clear_cdata_mode(self):
self.cdata_elem = None
self._escapable = True
+ def _set_support_cdata(self, flag=True):
+ """Enable or disable support of the CDATA sections.
+ If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
+ If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
+
+ This method is not called by default. Its purpose is to be called
+ in custom handle_starttag() and handle_endtag() methods, with
+ value that depends on the adjusted current node.
+ See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+ for details.
+ """
+ self._support_cdata = flag
+
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@@ -239,7 +253,7 @@ def goahead(self, end):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
- elif startswith("<![CDATA[", i):
+ elif startswith("<![CDATA[", i) and self._support_cdata:
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
@@ -315,8 +329,12 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
- elif rawdata[i:i+3] == '<![':
- return self.parse_marked_section(i)
+ elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
+ j = rawdata.find(']]>', i+9)
+ if j < 0:
+ return -1
+ self.unknown_decl(rawdata[i+3: j])
+ return j + 3
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
@@ -324,6 +342,15 @@ def parse_html_declaration(self, i):
return -1
self.handle_decl(rawdata[i+2:gtpos])
return gtpos+1
+ elif rawdata[i:i+3] == '<![':
+ j = rawdata.find('>', i+3)
+ if j < 0:
+ return -1
+ if rawdata[j-1] == ']':
+ self.unknown_decl(rawdata[i+3: j-1])
+ else:
+ self.handle_comment(rawdata[i+2: j])
+ return j + 1
else:
return self.parse_bogus_comment(i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 15f9714c1d0c6f..29f48098ae32ba 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -10,10 +10,13 @@
class EventCollector(html.parser.HTMLParser):
- def __init__(self, *args, **kw):
+ def __init__(self, *args, autocdata=False, **kw):
+ self.autocdata = autocdata
self.events = []
self.append = self.events.append
html.parser.HTMLParser.__init__(self, *args, **kw)
+ if autocdata:
+ self._set_support_cdata(False)
def get_events(self):
# Normalize the list of events so that buffer artefacts don't
@@ -34,12 +37,16 @@ def get_events(self):
def handle_starttag(self, tag, attrs):
self.append(("starttag", tag, attrs))
+ if self.autocdata and tag == 'svg':
+ self._set_support_cdata(True)
def handle_startendtag(self, tag, attrs):
self.append(("startendtag", tag, attrs))
def handle_endtag(self, tag):
self.append(("endtag", tag))
+ if self.autocdata and tag == 'svg':
+ self._set_support_cdata(False)
# all other markup
@@ -740,10 +747,6 @@ def test_eof_in_declarations(self):
('<!', [('comment', '')]),
('<!-', [('comment', '-')]),
('<![', [('comment', '[')]),
- ('<![CDATA[', [('unknown decl', 'CDATA[')]),
- ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
- ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
- ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -756,6 +759,18 @@ def test_eof_in_declarations(self):
for html, expected in data:
self._run_check(html, expected)
+ @support.subTests('content', ['', 'x', 'x]', 'x]]'])
+ def test_eof_in_cdata(self, content):
+ self._run_check('<![CDATA[' + content,
+ [('unknown decl', 'CDATA[' + content)])
+ self._run_check('<![CDATA[' + content,
+ [('comment', '[CDATA[' + content)],
+ collector=EventCollector(autocdata=True))
+ self._run_check('<svg><text y="100"><![CDATA[' + content,
+ [('starttag', 'svg', []),
+ ('starttag', 'text', [('y', '100')]),
+ ('unknown decl', 'CDATA[' + content)])
+
def test_bogus_comments(self):
html = ('<!ELEMENT br EMPTY>'
'<! not really a comment >'
@@ -805,8 +820,57 @@ def test_broken_condcoms(self):
('startendtag', 'img', [('src', 'mammoth.bmp')]),
('unknown decl', 'endif')
]
+
self._run_check(html, expected)
+ @support.subTests('content', [
+ 'just some plain text',
+ '<!-- not a comment -->',
+ '&not-an-entity-ref;',
+ "<not a='start tag'>",
+ '',
+ '[[I have many brackets]]',
+ 'I have a > in the middle',
+ 'I have a ]] in the middle',
+ '] ]>',
+ ']] >',
+ ('\n'
+ ' if (a < b && a > b) {\n'
+ ' printf("[<marquee>How?</marquee>]");\n'
+ ' }\n'),
+ ])
+ def test_cdata_section_content(self, content):
+ # See "13.2.5.42 Markup declaration open state",
+ # "13.2.5.69 CDATA section state", and issue bpo-32876.
+ html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
+ expected = [
+ ('starttag', 'svg', []),
+ ('starttag', 'text', [('y', '100')]),
+ ('unknown decl', 'CDATA[' + content),
+ ('endtag', 'text'),
+ ('endtag', 'svg'),
+ ]
+ self._run_check(html, expected)
+ self._run_check(html, expected, collector=EventCollector(autocdata=True))
+
+ def test_cdata_section(self):
+ # See "13.2.5.42 Markup declaration open state".
+ html = ('<![CDATA[foo<br>bar]]>'
+ '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
+ '<![CDATA[foo<br>bar]]>')
+ expected = [
+ ('comment', '[CDATA[foo<br'),
+ ('data', 'bar]]>'),
+ ('starttag', 'svg', []),
+ ('starttag', 'text', [('y', '100')]),
+ ('unknown decl', 'CDATA[foo<br>bar'),
+ ('endtag', 'text'),
+ ('endtag', 'svg'),
+ ('comment', '[CDATA[foo<br'),
+ ('data', 'bar]]>'),
+ ]
+ self._run_check(html, expected, collector=EventCollector(autocdata=True))
+
def test_convert_charrefs_dropped_text(self):
# #23144: make sure that all the events are triggered when
# convert_charrefs is True, even if we don't call .close()
diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
new file mode 100644
index 00000000000000..fe000d936aae9d
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
@@ -0,0 +1,5 @@
+Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
+the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
+Add private method ``_set_support_cdata()`` which can be used to specify
+how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
+(SVG or MathML) or as a bogus comment in the HTML namespace.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]