https://github.com/python/cpython/commit/435e891b32318b4a0fed2843d0cb37ee21c07e4b
commit: 435e891b32318b4a0fed2843d0cb37ee21c07e4b
branch: 3.11
author: Miss Islington (bot) <31488909+miss-isling...@users.noreply.github.com>
committer: serhiy-storchaka <storch...@gmail.com>
date: 2024-01-10T13:24:17Z
summary:

[3.11] gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730) (GH-113908)

It occurred when trying to re-encode an unknown-8bit part combined with
a non-unknown-8bit part.
(cherry picked from commit e9d5b6ea2d68564f176fdf70c2d7028e060c62b5)

Co-authored-by: Serhiy Storchaka <storch...@gmail.com>

files:
A Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst
M Lib/email/_header_value_parser.py
M Lib/test/test_email/test__header_value_parser.py

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index e637e6df06612d..f4334f1fe69cbe 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -2768,6 +2768,7 @@ def _refold_parse_tree(parse_tree, *, policy):
     encoding = 'utf-8' if policy.utf8 else 'us-ascii'
     lines = ['']
     last_ew = None
+    last_charset = None
     wrap_as_ew_blocked = 0
     want_encoding = False
     end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
@@ -2822,8 +2823,14 @@ def _refold_parse_tree(parse_tree, *, policy):
             else:
                 # It's a terminal, wrap it as an encoded word, possibly
                 # combining it with previously encoded words if allowed.
+                if (last_ew is not None and
+                    charset != last_charset and
+                    (last_charset == 'unknown-8bit' or
+                     last_charset == 'utf-8' and charset != 'us-ascii')):
+                    last_ew = None
                 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
                                       part.ew_combine_allowed, charset)
+                last_charset = charset
             want_encoding = False
             continue
         if len(tstr) <= maxlen - len(lines[-1]):
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 854f2ff009c618..bdb0e55f21069f 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -2915,6 +2915,45 @@ def test_ews_combined_before_wrap(self):
                         "mich.  And that's\n"
                    " all I'm sayin.\n")
 
+    def test_unicode_after_unknown_not_combined(self):
+        self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=\xa4"),
+                   "=?unknown-8bit?q?=A4?==?utf-8?q?=C2=A4?=\n")
+        prefix = "0123456789 "*5
+        self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=\xa4"),
+                   prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?=C2=A4?=\n")
+
+    def test_ascii_after_unknown_not_combined(self):
+        self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=abc"),
+                   "=?unknown-8bit?q?=A4?=abc\n")
+        prefix = "0123456789 "*5
+        self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=abc"),
+                   prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?abc?=\n")
+
+    def test_unknown_after_unicode_not_combined(self):
+        self._test(parser.get_unstructured("\xa4"
+                                           "=?unknown-8bit?q?=A4?="),
+                   "=?utf-8?q?=C2=A4?==?unknown-8bit?q?=A4?=\n")
+        prefix = "0123456789 "*5
+        self._test(parser.get_unstructured(prefix + "\xa4=?unknown-8bit?q?=A4?="),
+                   prefix + "=?utf-8?q?=C2=A4?=\n =?unknown-8bit?q?=A4?=\n")
+
+    def test_unknown_after_ascii_not_combined(self):
+        self._test(parser.get_unstructured("abc"
+                                           "=?unknown-8bit?q?=A4?="),
+                   "abc=?unknown-8bit?q?=A4?=\n")
+        prefix = "0123456789 "*5
+        self._test(parser.get_unstructured(prefix + "abcd=?unknown-8bit?q?=A4?="),
+                   prefix + "abcd\n =?unknown-8bit?q?=A4?=\n")
+
+    def test_unknown_after_unknown(self):
+        self._test(parser.get_unstructured("=?unknown-8bit?q?=C2?="
+                                           "=?unknown-8bit?q?=A4?="),
+                   "=?unknown-8bit?q?=C2=A4?=\n")
+        prefix = "0123456789 "*5
+        self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=C2?="
+                                           "=?unknown-8bit?q?=A4?="),
+                   prefix + "=?unknown-8bit?q?=C2?=\n =?unknown-8bit?q?=A4?=\n")
+
     # XXX Need test of an encoded word so long that it needs to be wrapped
 
     def test_simple_address(self):
diff --git a/Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst b/Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst
new file mode 100644
index 00000000000000..c71bc9c20e4596
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst
@@ -0,0 +1,2 @@
+Fix :exc:`UnicodeEncodeError` in :mod:`email` when re-folding lines that
+contain an unknown-8bit encoded part followed by a non-unknown-8bit encoded part.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to