https://github.com/python/cpython/commit/77b14a6d58e527f915966446eb0866652a46feb5 commit: 77b14a6d58e527f915966446eb0866652a46feb5 branch: main author: Sascha Ißbrücker <sascha.issbruec...@googlemail.com> committer: serhiy-storchaka <storch...@gmail.com> date: 2025-05-07T18:49:49+03:00 summary:
gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215) According to the HTML5 spec, named character references in attribute values should only be processed if they are not followed by an ASCII alphanumeric, or an equals sign. https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state files: A Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst M Lib/html/parser.py M Lib/test/test_htmlparser.py diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 13c95c34e505c8..0a1dd3b7d3bfd2 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -12,6 +12,7 @@ import _markupbase from html import unescape +from html.entities import html5 as html5_entities __all__ = ['HTMLParser'] @@ -23,6 +24,7 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') +attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') @@ -57,6 +59,22 @@ # </ and the tag name, so maybe this should be fixed endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') +# Character reference processing logic specific to attribute values +# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state +def _replace_attr_charref(match): + ref = match.group(0) + # Numeric / hex char refs must always be unescaped + if ref.startswith('&#'): + return unescape(ref) + # Named character / entity references must only be unescaped + # if they are an exact match, and they are not followed by an equals sign + if not ref.endswith('=') and ref[1:] in html5_entities: + return unescape(ref) + # Otherwise do not unescape + return ref + +def _unescape_attrvalue(s): + return attr_charref.sub(_replace_attr_charref, s) class HTMLParser(_markupbase.ParserBase): @@ -323,7 +341,7 @@ def parse_starttag(self, i): attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: - attrvalue = unescape(attrvalue) + attrvalue = _unescape_attrvalue(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index b42a611c62c0aa..4fdba06cf4cc92 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -348,18 +348,16 @@ def test_convert_charrefs(self): collector = lambda: EventCollectorCharrefs() self.assertTrue(collector().convert_charrefs) charrefs = ['"', '"', '"', '"', '"', '"'] - # check charrefs in the middle of the text/attributes - expected = [('starttag', 'a', [('href', 'foo"zar')]), - ('data', 'a"z'), ('endtag', 'a')] + # check charrefs in the middle of the text + expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')] for charref in charrefs: - self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref), + self._run_check('<a>a{0}z</a>'.format(charref), expected, collector=collector()) - # check charrefs at the beginning/end of the text/attributes - expected = [('data', '"'), - ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]), + # check charrefs at the beginning/end of the text + expected = [('data', '"'), ('starttag', 'a', []), ('data', '"'), ('endtag', 'a'), ('data', '"')] for charref in charrefs: - self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">' + self._run_check('{0}<a>' '{0}</a>{0}'.format(charref), expected, collector=collector()) # check charrefs in <script>/<style> elements @@ -382,6 +380,35 @@ def test_convert_charrefs(self): self._run_check('no charrefs here', [('data', 'no charrefs here')], collector=collector()) + def test_convert_charrefs_in_attribute_values(self): + # default value for convert_charrefs is now True + collector = lambda: EventCollectorCharrefs() + self.assertTrue(collector().convert_charrefs) + + # always unescape terminated entity refs, numeric and hex char refs: + # - regardless whether they are at start, middle, end of attribute + # - or followed by alphanumeric, non-alphanumeric, or equals char + charrefs = ['¢', '¢', '¢', '¢', '¢'] + expected = [('starttag', 'a', + [('x', '¢'), ('x', 'z¢'), ('x', '¢z'), + ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]), + ('endtag', 'a')] + for charref in charrefs: + self._run_check('<a x="{0}" x="z{0}" x="{0}z" ' + ' x="z{0}z" x="{0} z" x="{0}=z"></a>' + .format(charref), expected, collector=collector()) + + # only unescape unterminated entity matches if they are not followed by + # an alphanumeric or an equals sign + charref = '¢' + expected = [('starttag', 'a', + [('x', '¢'), ('x', 'z¢'), ('x', '¢z'), + ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]), + ('endtag', 'a')] + self._run_check('<a x="{0}" x="z{0}" x="{0}z" ' + ' x="z{0}z" x="{0} z" x="{0}=z"></a>' + .format(charref), expected, collector=collector()) + # the remaining tests were for the "tolerant" parser (which is now # the default), and check various kind of broken markup def test_tolerant_parsing(self): diff --git a/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst new file mode 100644 index 00000000000000..d8c081390d0669 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst @@ -0,0 +1,3 @@ +Fix :class:`html.parser.HTMLParser` to not unescape character entities in +attribute values if they are followed by an ASCII alphanumeric or an equals +sign. _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com