[Python-checkins] gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)

serhiy-storchaka Wed, 07 May 2025 08:50:19 -0700

https://github.com/python/cpython/commit/77b14a6d58e527f915966446eb0866652a46feb5
commit: 77b14a6d58e527f915966446eb0866652a46feb5
branch: main
author: Sascha Ißbrücker <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-05-07T18:49:49+03:00
summary:


gh-69426: HTMLParser: only unescape properly terminated character entities in 
attribute values (GH-95215)

According to the HTML5 spec, named character references in attribute values
should only be processed if they are not followed by an ASCII alphanumeric,
or an equals sign.

https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state

files:
A Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 13c95c34e505c8..0a1dd3b7d3bfd2 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -12,6 +12,7 @@
 import _markupbase
 
 from html import unescape
+from html.entities import html5 as html5_entities
 
 
 __all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+attr_charref = 
re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
 
 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
@@ -57,6 +59,22 @@
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# Character reference processing logic specific to attribute values
+# See: 
https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+    ref = match.group(0)
+    # Numeric / hex char refs must always be unescaped
+    if ref.startswith('&#'):
+        return unescape(ref)
+    # Named character / entity references must only be unescaped
+    # if they are an exact match, and they are not followed by an equals sign
+    if not ref.endswith('=') and ref[1:] in html5_entities:
+        return unescape(ref)
+    # Otherwise do not unescape
+    return ref
+
+def _unescape_attrvalue(s):
+    return attr_charref.sub(_replace_attr_charref, s)
 
 
 class HTMLParser(_markupbase.ParserBase):
@@ -323,7 +341,7 @@ def parse_starttag(self, i):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = unescape(attrvalue)
+                attrvalue = _unescape_attrvalue(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index b42a611c62c0aa..4fdba06cf4cc92 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -348,18 +348,16 @@ def test_convert_charrefs(self):
         collector = lambda: EventCollectorCharrefs()
         self.assertTrue(collector().convert_charrefs)
         charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
-        # check charrefs in the middle of the text/attributes
-        expected = [('starttag', 'a', [('href', 'foo"zar')]),
-                    ('data', 'a"z'), ('endtag', 'a')]
+        # check charrefs in the middle of the text
+        expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
         for charref in charrefs:
-            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+            self._run_check('<a>a{0}z</a>'.format(charref),
                             expected, collector=collector())
-        # check charrefs at the beginning/end of the text/attributes
-        expected = [('data', '"'),
-                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+        # check charrefs at the beginning/end of the text
+        expected = [('data', '"'), ('starttag', 'a', []),
                     ('data', '"'), ('endtag', 'a'), ('data', '"')]
         for charref in charrefs:
-            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+            self._run_check('{0}<a>'
                             '{0}</a>{0}'.format(charref),
                             expected, collector=collector())
         # check charrefs in <script>/<style> elements
@@ -382,6 +380,35 @@ def test_convert_charrefs(self):
         self._run_check('no charrefs here', [('data', 'no charrefs here')],
                         collector=collector())
 
+    def test_convert_charrefs_in_attribute_values(self):
+        # default value for convert_charrefs is now True
+        collector = lambda: EventCollectorCharrefs()
+        self.assertTrue(collector().convert_charrefs)
+
+        # always unescape terminated entity refs, numeric and hex char refs:
+        # - regardless whether they are at start, middle, end of attribute
+        # - or followed by alphanumeric, non-alphanumeric, or equals char
+        charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
+                      ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
+                    ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                            '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                            .format(charref), expected, collector=collector())
+
+        # only unescape unterminated entity matches if they are not followed by
+        # an alphanumeric or an equals sign
+        charref = '&cent'
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
+                      ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
+                    ('endtag', 'a')]
+        self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                        '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                        .format(charref), expected, collector=collector())
+
     # the remaining tests were for the "tolerant" parser (which is now
     # the default), and check various kind of broken markup
     def test_tolerant_parsing(self):
diff --git 
a/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst 
b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
new file mode 100644
index 00000000000000..d8c081390d0669
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
@@ -0,0 +1,3 @@
+Fix :class:`html.parser.HTMLParser` to not unescape character entities in
+attribute values if they are followed by an ASCII alphanumeric or an equals
+sign.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

[Python-checkins] gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)

Reply via email to