[Python-checkins] gh-88726: Stop using non-standard charset names eucgb2312_cn and big5_tw in email (GH-149959)

serhiy-storchaka Tue, 26 May 2026 11:53:02 -0700

https://github.com/python/cpython/commit/5e467f4331d4cb7a8e2986c27af7eb68ccaccb37
commit: 5e467f4331d4cb7a8e2986c27af7eb68ccaccb37
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-05-26T21:52:47+03:00
summary:


gh-88726: Stop using non-standard charset names eucgb2312_cn and big5_tw in email (GH-149959)

files:
A Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst
M Lib/email/charset.py
M Lib/test/test_email/test_asian_codecs.py

diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 5036c3f58a5633c..c4b246455f86c64 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -93,8 +93,6 @@
 
 # Map charsets to their Unicode codec strings.
 CODEC_MAP = {
-    'gb2312':      'eucgb2312_cn',
-    'big5':        'big5_tw',
     # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
     # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
     # Let that stuff pass through without conversion to/from Unicode.
diff --git a/Lib/test/test_email/test_asian_codecs.py b/Lib/test/test_email/test_asian_codecs.py
index ca44f54c69b39bc..85979ffd8169a75 100644
--- a/Lib/test/test_email/test_asian_codecs.py
+++ b/Lib/test/test_email/test_asian_codecs.py
@@ -58,6 +58,62 @@ def test_japanese_codecs(self):
         # TK: full decode comparison
         eq(str(h).encode(jcode), subject_bytes)
 
+        h = Header("Japanese")
+        s = '\u65e5\u672c\u8a9e' # 日本語
+        h.append(s, Charset('euc-jp'))
+        h.append(s, Charset('iso-2022-jp'))
+        h.append(s, Charset('shift_jis'))
+        eq(h.encode(), """\
+Japanese =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?= =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?=
+ =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?=""")
+        eq(decode_header(h.encode()),
+           [(b'Japanese ', None),
+            (b'\x1b$BF|K\\8l\x1b(B\x1b$BF|K\\8l\x1b(B\x1b$BF|K\\8l\x1b(B', 'iso-2022-jp'),
+           ])
+
+    def test_chinese_codecs(self):
+        eq = self.ndiffAssertEqual
+        h = Header("Chinese")
+        s = '\u4e2d\u6587' # 中文
+        h.append(s, Charset('gb2312'))
+        h.append(s, Charset('gbk'))
+        h.append(s, Charset('gb18030'))
+        h.append(s, Charset('hz'))
+        h.append(s, Charset('big5'))
+        h.append(s, Charset('big5hkscs'))
+        eq(h.encode(), """\
+Chinese =?gb2312?b?1tDOxA==?= =?gbk?b?1tDOxA==?= =?gb18030?b?1tDOxA==?=
+ =?hz?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5hkscs?b?pKSk5Q==?=""")
+        eq(decode_header(h.encode()),
+           [(b'Chinese ', None),
+            (b'\xd6\xd0\xce\xc4', 'gb2312'),
+            (b'\xd6\xd0\xce\xc4', 'gbk'),
+            (b'\xd6\xd0\xce\xc4', 'gb18030'),
+            (b'~{VPND~}', 'hz'),
+            (b'\xa4\xa4\xa4\xe5', 'big5'),
+            (b'\xa4\xa4\xa4\xe5', 'big5hkscs'),
+           ])
+
+    def test_korean_codecs(self):
+        eq = self.ndiffAssertEqual
+        h = Header("Korean")
+        s = '\ud55c\uad6d\uc5b4' # 한국어
+        h.append(s, Charset('euc-kr'))
+        h.append(s, Charset('ks_c_5601-1987'))
+        h.append(s, Charset('cp949'))
+        h.append(s, Charset('iso-2022-kr'))
+        h.append(s, Charset('johab'))
+        eq(h.encode(), """\
+Korean =?euc-kr?b?x9Gxub7u?= =?ks_c_5601-1987?b?x9Gxub7uIMfRsbm+7g==?=
+ =?iso-2022-kr?b?GyQpQw5HUTE5Pm4P?= =?johab?b?0GWKgrTh?=""")
+        eq(decode_header(h.encode()),
+           [(b'Korean ', None),
+            (b'\xc7\xd1\xb1\xb9\xbe\xee', 'euc-kr'),
+            (b'\xc7\xd1\xb1\xb9\xbe\xee \xc7\xd1\xb1\xb9\xbe\xee', 'ks_c_5601-1987'),
+            (b'\x1b$)C\x0eGQ19>n\x0f', 'iso-2022-kr'),
+            (b'\xd0e\x8a\x82\xb4\xe1', 'johab'),
+           ])
+
     def test_payload_encoding_utf8(self):
         jhello = str(b'\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc'
                      b'\xa5\xeb\xa5\xc9\xa1\xaa', 'euc-jp')
diff --git a/Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst b/Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst
new file mode 100644
index 000000000000000..ba9058d79c9873a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-17-22-37-02.gh-issue-88726.BAoL6j.rst
@@ -0,0 +1,2 @@
+The :mod:`email` package now uses standard MIME charset names "gb2312" and
+"big5" instead of non-standard names "eucgb2312_cn" and "big5_tw".

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

[Python-checkins] gh-88726: Stop using non-standard charset names eucgb2312_cn and big5_tw in email (GH-149959)

Reply via email to