https://github.com/python/cpython/commit/5dfb5e640ec03ff057dbd9f1e32bf6a493fd2f3d commit: 5dfb5e640ec03ff057dbd9f1e32bf6a493fd2f3d branch: 3.12 author: Miss Islington (bot) <31488909+miss-isling...@users.noreply.github.com> committer: bitdancer <rdmur...@bitdance.com> date: 2025-03-18T15:35:32-04:00 summary:
[3.12] gh-121284: Fix email address header folding with parsed encoded-word (GH-122754) (#131404) gh-121284: Fix email address header folding with parsed encoded-word (GH-122754) Email generators using email.policy.default may convert an RFC 2047 encoded-word to unencoded form during header refolding. In a structured header, this could allow 'specials' chars outside a quoted-string, leading to invalid address headers and enabling spoofing. This change ensures a parsed encoded-word that contains specials is kept as an encoded-word while the header is refolded. [Better fix from @bitdancer.] --------- (cherry picked from commit 295b53df2aa18deb625a7da41f7e4babfe6ef34b) Co-authored-by: Mike Edmunds <medmu...@gmail.com> Co-authored-by: R David Murray <rdmur...@bitdance.com> Co-authored-by: Petr Viktorin <encu...@gmail.com> files: A Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst M Lib/email/_header_value_parser.py M Lib/test/test_email/test__header_value_parser.py diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 3d845c09d415f6..9a51b9437333db 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1053,7 +1053,7 @@ def get_fws(value): fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') return fws, newvalue -def get_encoded_word(value): +def get_encoded_word(value, terminal_type='vtext'): """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" """ @@ -1092,7 +1092,7 @@ def get_encoded_word(value): ew.append(token) continue chars, *remainder = _wsp_splitter(text, 1) - vtext = ValueTerminal(chars, 'vtext') + vtext = ValueTerminal(chars, terminal_type) _validate_xtext(vtext) ew.append(vtext) text = ''.join(remainder) @@ -1134,7 +1134,7 @@ def get_unstructured(value): valid_ew = True if value.startswith('=?'): try: - token, value = get_encoded_word(value) + token, value = get_encoded_word(value, 'utext') except _InvalidEwError: valid_ew = False except errors.HeaderParseError: @@ -1163,7 +1163,7 @@ def get_unstructured(value): # the parser to go in an infinite loop. if valid_ew and rfc2047_matcher.search(tok): tok, *remainder = value.partition('=?') - vtext = ValueTerminal(tok, 'vtext') + vtext = ValueTerminal(tok, 'utext') _validate_xtext(vtext) unstructured.append(vtext) value = ''.join(remainder) @@ -2813,7 +2813,7 @@ def _refold_parse_tree(parse_tree, *, policy): continue tstr = str(part) if not want_encoding: - if part.token_type == 'ptext': + if part.token_type in ('ptext', 'vtext'): # Encode if tstr contains special characters. want_encoding = not SPECIALSNL.isdisjoint(tstr) else: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index efd1695711d7c1..69d4e9ab126d6d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3076,6 +3076,31 @@ def test_address_list_with_unicode_names_in_quotes(self): '=?utf-8?q?H=C3=BCbsch?= Kaktus <beauti...@example.com>,\n' ' =?utf-8?q?bei=C3=9Ft_bei=C3=9Ft?= <bi...@example.com>\n') + def test_address_list_with_specials_in_encoded_word(self): + # An encoded-word parsed from a structured header must remain + # encoded when it contains specials. Regression for gh-121284. + policy = self.policy.clone(max_line_length=40) + cases = [ + # (to, folded) + ('=?utf-8?q?A_v=C3=A9ry_long_name_with=2C_comma?= <t...@example.com>', + 'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n' + ' =?utf-8?q?=2C?= comma <t...@example.com>\n'), + ('=?utf-8?q?This_long_name_does_not_need_encoded=2Dword?= <t...@example.com>', + 'This long name does not need\n' + ' encoded-word <t...@example.com>\n'), + ('"A véry long name with, comma" <t...@example.com>', + # (This isn't the best fold point, but it's not invalid.) + 'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n' + ' =?utf-8?q?=2C?= comma <t...@example.com>\n'), + ('"A véry long name containing a, comma" <t...@example.com>', + 'A =?utf-8?q?v=C3=A9ry?= long name\n' + ' containing =?utf-8?q?a=2C?= comma\n' + ' <t...@example.com>\n'), + ] + for (to, folded) in cases: + with self.subTest(to=to): + self._test(parser.get_address_list(to)[0], folded, policy=policy) + def test_address_list_with_list_separator_after_fold(self): a = 'x' * 66 + '@example.com' to = f'{a}, "Hübsch Kaktus" <beauti...@example.com>' diff --git a/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst new file mode 100644 index 00000000000000..923e91170d355f --- /dev/null +++ b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst @@ -0,0 +1,7 @@ +Fix bug in the folding of rfc2047 encoded-words when flattening an email message +using a modern email policy. Previously when an encoded-word was too long +for a line, it would be decoded, split across lines, and re-encoded. But commas +and other special characters in the original text could be left unencoded and +unquoted. This could theoretically be used to spoof header lines using +a carefully constructed encoded-word if the resulting rendered email was +transmitted or re-parsed. _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com