[Python-checkins] gh-101828: Fix `jisx0213` codecs removing null characters (gh-139340)

corona10 Fri, 17 Oct 2025 07:52:36 -0700

https://github.com/python/cpython/commit/87eadce3e0309d80a95e85d70a00028b5dca9907
commit: 87eadce3e0309d80a95e85d70a00028b5dca9907
branch: main
author: Stan Ulbrych <[email protected]>
committer: corona10 <[email protected]>
date: 2025-10-14T22:55:00+09:00
summary:


gh-101828: Fix `jisx0213` codecs removing null characters (gh-139340)

files:
A Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
M Lib/test/multibytecodec_support.py
M Modules/cjkcodecs/_codecs_iso2022.c
M Modules/cjkcodecs/_codecs_jp.c

diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py
index dbf0cc428e3ff6..6b4c57d0b4bad7 100644
--- a/Lib/test/multibytecodec_support.py
+++ b/Lib/test/multibytecodec_support.py
@@ -282,6 +282,23 @@ def test_incrementalencoder_del_segfault(self):
         with self.assertRaises(AttributeError):
             del e.errors
 
+    def test_null_terminator(self):
+        # see gh-101828
+        text = "フルーツ"
+        try:
+            text.encode(self.encoding)
+        except UnicodeEncodeError:
+            text = "Python is cool"
+        encode_w_null = (text + "\0").encode(self.encoding)
+        encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding)
+        self.assertTrue(encode_w_null.endswith(b'\x00'))
+        self.assertEqual(encode_w_null, encode_plus_null)
+
+        encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding)
+        encode_plus_null_2 = encode_plus_null + encode_plus_null
+        self.assertEqual(encode_w_null_2.count(b'\x00'), 2)
+        self.assertEqual(encode_w_null_2, encode_plus_null_2)
+
 
 class TestBase_Mapping(unittest.TestCase):
     pass_enctest = []
diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
new file mode 100644
index 00000000000000..1d100180c072ec
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
@@ -0,0 +1,3 @@
+Fix ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and
+``'euc_jis_2004'`` codecs truncating null chars
+as they were treated as part of multi-character sequences.
diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c
index ef6faeb71274e1..b1984df2695b17 100644
--- a/Modules/cjkcodecs/_codecs_iso2022.c
+++ b/Modules/cjkcodecs/_codecs_iso2022.c
@@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
         return coded;
 
     case 2: /* second character of unicode pair */
-        coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
-                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
-        if (coded != DBCINV)
-            return coded;
+        if (data[1] != 0) { /* Don't consume null char as part of pair */
+            coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
+                                    jisx0213_pair_encmap, JISX0213_ENCPAIRS);
+            if (coded != DBCINV) {
+                return coded;
+            }
+        }
         _Py_FALLTHROUGH;
 
     case -1: /* flush unterminated */
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
index f7127487aa5f59..cd77888d5514b8 100644
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -192,8 +192,11 @@ ENCODER(euc_jis_2004)
                                 JISX0213_ENCPAIRS);
                             if (code == DBCINV)
                                 return 1;
-                        } else
+                        }
+                        else if (c2 != 0) {
+                            /* Don't consume null char as part of pair */
                             insize = 2;
+                        }
                     }
                 }
             }
@@ -611,8 +614,10 @@ ENCODER(shift_jis_2004)
                             if (code == DBCINV)
                                 return 1;
                             }
-                            else
+                            else if (ch2 != 0) {
+                                /* Don't consume null char as part of pair */
                                 insize = 2;
+                            }
                         }
                     }
                 }

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

[Python-checkins] gh-101828: Fix `jisx0213` codecs removing null characters (gh-139340)

Reply via email to