https://github.com/python/cpython/commit/87eadce3e0309d80a95e85d70a00028b5dca9907
commit: 87eadce3e0309d80a95e85d70a00028b5dca9907
branch: main
author: Stan Ulbrych <[email protected]>
committer: corona10 <[email protected]>
date: 2025-10-14T22:55:00+09:00
summary:
gh-101828: Fix `jisx0213` codecs removing null characters (gh-139340)
files:
A Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
M Lib/test/multibytecodec_support.py
M Modules/cjkcodecs/_codecs_iso2022.c
M Modules/cjkcodecs/_codecs_jp.c
diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py
index dbf0cc428e3ff6..6b4c57d0b4bad7 100644
--- a/Lib/test/multibytecodec_support.py
+++ b/Lib/test/multibytecodec_support.py
@@ -282,6 +282,23 @@ def test_incrementalencoder_del_segfault(self):
with self.assertRaises(AttributeError):
del e.errors
+ def test_null_terminator(self):
+ # see gh-101828
+ text = "フルーツ"
+ try:
+ text.encode(self.encoding)
+ except UnicodeEncodeError:
+ text = "Python is cool"
+ encode_w_null = (text + "\0").encode(self.encoding)
+ encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding)
+ self.assertTrue(encode_w_null.endswith(b'\x00'))
+ self.assertEqual(encode_w_null, encode_plus_null)
+
+ encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding)
+ encode_plus_null_2 = encode_plus_null + encode_plus_null
+ self.assertEqual(encode_w_null_2.count(b'\x00'), 2)
+ self.assertEqual(encode_w_null_2, encode_plus_null_2)
+
class TestBase_Mapping(unittest.TestCase):
pass_enctest = []
diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
new file mode 100644
index 00000000000000..1d100180c072ec
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst
@@ -0,0 +1,3 @@
+Fix ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and
+``'euc_jis_2004'`` codecs truncating null chars
+as they were treated as part of multi-character sequences.
diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c
index ef6faeb71274e1..b1984df2695b17 100644
--- a/Modules/cjkcodecs/_codecs_iso2022.c
+++ b/Modules/cjkcodecs/_codecs_iso2022.c
@@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
return coded;
case 2: /* second character of unicode pair */
- coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
- jisx0213_pair_encmap, JISX0213_ENCPAIRS);
- if (coded != DBCINV)
- return coded;
+ if (data[1] != 0) { /* Don't consume null char as part of pair */
+ coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
+ jisx0213_pair_encmap, JISX0213_ENCPAIRS);
+ if (coded != DBCINV) {
+ return coded;
+ }
+ }
_Py_FALLTHROUGH;
case -1: /* flush unterminated */
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
index f7127487aa5f59..cd77888d5514b8 100644
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -192,8 +192,11 @@ ENCODER(euc_jis_2004)
JISX0213_ENCPAIRS);
if (code == DBCINV)
return 1;
- } else
+ }
+ else if (c2 != 0) {
+ /* Don't consume null char as part of pair */
insize = 2;
+ }
}
}
}
@@ -611,8 +614,10 @@ ENCODER(shift_jis_2004)
if (code == DBCINV)
return 1;
}
- else
+ else if (ch2 != 0) {
+ /* Don't consume null char as part of pair */
insize = 2;
+ }
}
}
}
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]