https://github.com/python/cpython/commit/a18b38172ab1d9cd0a25b6977271e062ecc4f3b0 commit: a18b38172ab1d9cd0a25b6977271e062ecc4f3b0 branch: 3.11 author: Serhiy Storchaka <[email protected]> committer: ambv <[email protected]> date: 2025-10-31T18:14:55+01:00 summary:
[3.11] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser (GH-137837) (GH-140842) (GH-140852) (cherry picked from commit a17c57eee5b5cc81390750d07e4800b19c0c3084) (cherry picked from commit 0329bd11c7e98484727bbb9062d53a8fa53ac7fd) Co-authored-by: Łukasz Langa <[email protected]> files: A Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst M Doc/library/html.parser.rst M Lib/html/parser.py M Lib/test/test_htmlparser.py diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index d35090111e0822..c6020925404667 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -15,14 +15,18 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(*, convert_charrefs=True) +.. class:: HTMLParser(*, convert_charrefs=True, scripting=False) Create a parser instance able to parse invalid markup. - If *convert_charrefs* is ``True`` (the default), all character - references (except the ones in ``script``/``style`` elements) are + If *convert_charrefs* is true (the default), all character + references (except the ones in elements like ``script`` and ``style``) are automatically converted to the corresponding Unicode characters. + If *scripting* is false (the default), the content of the ``noscript`` + element is parsed normally; if it's true, it's returned as is without + being parsed. + An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered. The user should subclass :class:`.HTMLParser` and override its @@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. .. versionchanged:: 3.5 The default value for argument *convert_charrefs* is now ``True``. + .. versionchanged:: 3.11.15 + Added the *scripting* parameter. + Example HTML Parser Application ------------------------------- @@ -159,15 +166,15 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): .. method:: HTMLParser.handle_data(data) This method is called to process arbitrary data (e.g. text nodes and the - content of ``<script>...</script>`` and ``<style>...</style>``). + content of elements like ``script`` and ``style``). .. method:: HTMLParser.handle_entityref(name) This method is called to process a named character reference of the form ``&name;`` (e.g. ``>``), where *name* is a general entity reference - (e.g. ``'gt'``). This method is never called if *convert_charrefs* is - ``True``. + (e.g. ``'gt'``). + This method is only called if *convert_charrefs* is false. .. method:: HTMLParser.handle_charref(name) @@ -175,8 +182,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): This method is called to process decimal and hexadecimal numeric character references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`. For example, the decimal equivalent for ``>`` is ``>``, whereas the hexadecimal is ``>``; - in this case the method will receive ``'62'`` or ``'x3E'``. This method - is never called if *convert_charrefs* is ``True``. + in this case the method will receive ``'62'`` or ``'x3E'``. + This method is only called if *convert_charrefs* is false. .. method:: HTMLParser.handle_comment(data) @@ -284,8 +291,8 @@ Parsing an element with a few attributes and a title:: Data : Python End tag : h1 -The content of ``script`` and ``style`` elements is returned as is, without -further parsing:: +The content of elements like ``script`` and ``style`` is returned as is, +without further parsing:: >>> parser.feed('<style type="text/css">#python { color: green }</style>') Start tag: style @@ -294,10 +301,10 @@ further parsing:: End tag : style >>> parser.feed('<script type="text/javascript">' - ... 'alert("<strong>hello!</strong>");</script>') + ... 'alert("<strong>hello! ☺</strong>");</script>') Start tag: script attr: ('type', 'text/javascript') - Data : alert("<strong>hello!</strong>"); + Data : alert("<strong>hello! ☺</strong>"); End tag : script Parsing comments:: @@ -317,7 +324,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``):: Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but :meth:`~HTMLParser.handle_data` might be called more than once -(unless *convert_charrefs* is set to ``True``):: +if *convert_charrefs* is false:: >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']: ... parser.feed(chunk) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 8eae9dc55e568c..fb3c13b873f93f 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase): argument. """ - CDATA_CONTENT_ELEMENTS = ("script", "style") + # See the HTML5 specs section "13.4 Parsing HTML fragments". + # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments + # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode + CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes") RCDATA_CONTENT_ELEMENTS = ("textarea", "title") - def __init__(self, *, convert_charrefs=True): + def __init__(self, *, convert_charrefs=True, scripting=False): """Initialize and reset this instance. - If convert_charrefs is True (the default), all character references + If convert_charrefs is true (the default), all character references are automatically converted to the corresponding Unicode characters. + + If *scripting* is false (the default), the content of the + ``noscript`` element is parsed normally; if it's true, + it's returned as is without being parsed. """ self.convert_charrefs = convert_charrefs + self.scripting = scripting self.reset() def reset(self): @@ -153,7 +161,9 @@ def get_starttag_text(self): def set_cdata_mode(self, elem, *, escapable=False): self.cdata_elem = elem.lower() self._escapable = escapable - if escapable and not self.convert_charrefs: + if self.cdata_elem == 'plaintext': + self.interesting = re.compile(r'\Z') + elif escapable and not self.convert_charrefs: self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem, re.IGNORECASE|re.ASCII) else: @@ -434,8 +444,10 @@ def parse_starttag(self, i): self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) + if (tag in self.CDATA_CONTENT_ELEMENTS or + (self.scripting and tag == "noscript") or + tag == "plaintext"): + self.set_cdata_mode(tag, escapable=False) elif tag in self.RCDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag, escapable=True) return endpos diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index a7be7a6e20224a..1c1be3ff476886 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -7,6 +7,18 @@ from test import support +SAMPLE_RCDATA = ( + '<!-- not a comment -->' + "<not a='start tag'>" + '<![CDATA[not a cdata]]>' + '<!not a bogus comment>' + '</not a bogus comment>' + '\u2603' +) + +SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&☺' + + class EventCollector(html.parser.HTMLParser): def __init__(self, *args, autocdata=False, **kw): @@ -292,30 +304,20 @@ def test_get_starttag_text(self): 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'), '\n<!-- //\nvar foo = 3.14;\n// -->\n', '<!-- \u2603 -->', - 'foo = "</ script>"', - 'foo = "</scripture>"', - 'foo = "</script\v>"', - 'foo = "</script\xa0>"', - 'foo = "</ſcript>"', - 'foo = "</scrıpt>"', ]) def test_script_content(self, content): s = f'<script>{content}</script>' - self._run_check(s, [("starttag", "script", []), - ("data", content), - ("endtag", "script")]) + self._run_check(s, [ + ("starttag", "script", []), + ("data", content), + ("endtag", "script"), + ]) @support.subTests('content', [ 'a::before { content: "<!-- not a comment -->"; }', 'a::before { content: "¬-an-entity-ref;"; }', 'a::before { content: "<not a=\'start tag\'>"; }', 'a::before { content: "\u2603"; }', - 'a::before { content: "< /style>"; }', - 'a::before { content: "</ style>"; }', - 'a::before { content: "</styled>"; }', - 'a::before { content: "</style\v>"; }', - 'a::before { content: "</style\xa0>"; }', - 'a::before { content: "</ſtyle>"; }', ]) def test_style_content(self, content): s = f'<style>{content}</style>' @@ -323,47 +325,59 @@ def test_style_content(self, content): ("data", content), ("endtag", "style")]) - @support.subTests('content', [ - '<!-- not a comment -->', - "<not a='start tag'>", - '<![CDATA[not a cdata]]>', - '<!not a bogus comment>', - '</not a bogus comment>', - '\u2603', - '< /title>', - '</ title>', - '</titled>', - '</title\v>', - '</title\xa0>', - '</tıtle>', + @support.subTests('tag', ['title', 'textarea']) + def test_rcdata_content(self, tag): + source = f"<{tag}>{SAMPLE_RCDATA}</{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", SAMPLE_RCDATA), + ("endtag", tag), ]) - def test_title_content(self, content): - source = f"<title>{content}</title>" + source = f"<{tag}>&</{tag}>" self._run_check(source, [ - ("starttag", "title", []), - ("data", content), - ("endtag", "title"), + ("starttag", tag, []), + ('entityref', 'amp'), + ("endtag", tag), ]) - @support.subTests('content', [ - '<!-- not a comment -->', - "<not a='start tag'>", - '<![CDATA[not a cdata]]>', - '<!not a bogus comment>', - '</not a bogus comment>', - '\u2603', - '< /textarea>', - '</ textarea>', - '</textareable>', - '</textarea\v>', - '</textarea\xa0>', + @support.subTests('tag', + ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script']) + def test_rawtext_content(self, tag): + source = f"<{tag}>{SAMPLE_RAWTEXT}</{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", SAMPLE_RAWTEXT), + ("endtag", tag), + ]) + + def test_noscript_content(self): + source = f"<noscript>{SAMPLE_RAWTEXT}</noscript>" + # scripting=False -- normal mode + self._run_check(source, [ + ('starttag', 'noscript', []), + ('comment', ' not a comment '), + ('starttag', 'not', [('a', 'start tag')]), + ('unknown decl', 'CDATA[not a cdata'), + ('comment', 'not a bogus comment'), + ('endtag', 'not'), + ('data', '☃'), + ('entityref', 'amp'), + ('charref', '9786'), + ('endtag', 'noscript'), ]) - def test_textarea_content(self, content): - source = f"<textarea>{content}</textarea>" + # scripting=True -- RAWTEXT mode + self._run_check(source, [ + ("starttag", "noscript", []), + ("data", SAMPLE_RAWTEXT), + ("endtag", "noscript"), + ], collector=EventCollector(scripting=True)) + + def test_plaintext_content(self): + content = SAMPLE_RAWTEXT + '</plaintext>' # not closing + source = f"<plaintext>{content}" self._run_check(source, [ - ("starttag", "textarea", []), + ("starttag", "plaintext", []), ("data", content), - ("endtag", "textarea"), ]) @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', @@ -380,52 +394,65 @@ def test_script_closing_tag(self, endtag): ("endtag", "script")], collector=EventCollectorNoNormalize(convert_charrefs=False)) - @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n', - 'style/', 'style foo=bar', 'style foo=">"']) - def test_style_closing_tag(self, endtag): - content = """ - b::before { content: "<!-- not a comment -->"; } - p::before { content: "¬-an-entity-ref;"; } - a::before { content: "<i>"; } - a::after { content: "</i>"; } - """ - s = f'<StyLE>{content}</{endtag}>' - self._run_check(s, [("starttag", "style", []), - ("data", content), - ("endtag", "style")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n', - 'title/', 'title foo=bar', 'title foo=">"']) - def test_title_closing_tag(self, endtag): - content = "<!-- not a comment --><i>Egg & Spam</i>" - s = f'<TitLe>{content}</{endtag}>' - self._run_check(s, [("starttag", "title", []), - ('data', '<!-- not a comment --><i>Egg & Spam</i>'), - ("endtag", "title")], - collector=EventCollectorNoNormalize(convert_charrefs=True)) - self._run_check(s, [("starttag", "title", []), - ('data', '<!-- not a comment --><i>Egg '), - ('entityref', 'amp'), - ('data', ' Spam</i>'), - ("endtag", "title")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n', - 'textarea/', 'textarea foo=bar', 'textarea foo=">"']) - def test_textarea_closing_tag(self, endtag): - content = "<!-- not a comment --><i>Egg & Spam</i>" - s = f'<TexTarEa>{content}</{endtag}>' - self._run_check(s, [("starttag", "textarea", []), - ('data', '<!-- not a comment --><i>Egg & Spam</i>'), - ("endtag", "textarea")], - collector=EventCollectorNoNormalize(convert_charrefs=True)) - self._run_check(s, [("starttag", "textarea", []), - ('data', '<!-- not a comment --><i>Egg '), - ('entityref', 'amp'), - ('data', ' Spam</i>'), - ("endtag", "textarea")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('tag', [ + 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', + 'textarea', 'title', 'noscript', + ]) + def test_closing_tag(self, tag): + for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n', + f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']: + content = "<!-- not a comment --><i>Spam</i>" + s = f'<{tag.upper()}>{content}</{endtag}>' + self._run_check(s, [ + ("starttag", tag, []), + ('data', content), + ("endtag", tag), + ], collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True)) + + @support.subTests('tag', [ + 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', + 'textarea', 'title', 'noscript', + ]) + def test_invalid_closing_tag(self, tag): + content = ( + f'< /{tag}>' + f'</ {tag}>' + f'</{tag}x>' + f'</{tag}\v>' + f'</{tag}\xa0>' + ) + source = f"<{tag}>{content}</{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + + @support.subTests('tag,endtag', [ + ('title', 'tıtle'), + ('style', 'ſtyle'), + ('style', 'ſtyle'), + ('style', 'style'), + ('iframe', 'ıframe'), + ('noframes', 'noframeſ'), + ('noscript', 'noſcript'), + ('noscript', 'noscrıpt'), + ('script', 'ſcript'), + ('script', 'scrıpt'), + ]) + def test_invalid_nonascii_closing_tag(self, tag, endtag): + content = f"<br></{endtag}>" + source = f"<{tag}>{content}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + source = f"<{tag}>{content}</{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) @support.subTests('tail,end', [ ('', False), diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst new file mode 100644 index 00000000000000..c30c9439a76a19 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst @@ -0,0 +1,3 @@ +Add support of the "plaintext" element, RAWTEXT elements "xmp", "iframe", +"noembed" and "noframes", and optionally RAWTEXT element "noscript" in +:class:`html.parser.HTMLParser`. _______________________________________________ Python-checkins mailing list -- [email protected] To unsubscribe send an email to [email protected] https://mail.python.org/mailman3//lists/python-checkins.python.org Member address: [email protected]
