https://github.com/python/cpython/commit/1e17ccd030a2285ad53db5952360fffa33a8a877
commit: 1e17ccd030a2285ad53db5952360fffa33a8a877
branch: main
author: R. David Murray <[email protected]>
committer: bitdancer <[email protected]>
date: 2025-12-24T09:14:39-05:00
summary:
Correctly fold unknown-8bit originating from encoded words. (#142517)
The unknown-8bit trick was designed to deal with unknown bytes in an
ASCII message, and it works fine for that. However, I also tried to
extend it to handle bytes that can't be decoded using the charset
specified in an encoded word, and there it fails because there can be
other non-ASCII characters that were *successfully* decoded. The fix is
simple: do the unknown-8bit encoding using the utf-8 codec. This is
especially appropriate since anyone trying to do recovery on an unknown
byte string will probably attempt utf-8 first.
files:
A Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst
M Lib/email/_encoded_words.py
M Lib/test/test_email/test__header_value_parser.py
diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py
index 6795a606de037e..05a34a4c105233 100644
--- a/Lib/email/_encoded_words.py
+++ b/Lib/email/_encoded_words.py
@@ -219,7 +219,7 @@ def encode(string, charset='utf-8', encoding=None, lang=''):
"""
if charset == 'unknown-8bit':
- bstring = string.encode('ascii', 'surrogateescape')
+ bstring = string.encode('utf-8', 'surrogateescape')
else:
bstring = string.encode(charset)
if encoding is None:
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index f33844910beee4..426ec4644e3096 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -3340,5 +3340,13 @@ def test_fold_unfoldable_element_stealing_whitespace(self):
token = parser.get_address_list(text)[0]
self._test(token, expected, policy=policy)
+ def test_encoded_word_with_undecodable_bytes(self):
+ self._test(parser.get_address_list(
+ ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
+ )[0],
+ ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n',
+ )
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst b/Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst
new file mode 100644
index 00000000000000..388fff0e2acb96
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst
@@ -0,0 +1,4 @@
+The non-``compat32`` :mod:`email` policies now correctly handle refolding
+encoded words that contain bytes that cannot be decoded in their specified
+character set. Previously this resulted in an encoding exception during
+folding.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]