https://github.com/python/cpython/commit/eaf217108226633c03cc5c4c90f0b6e4587c8803
commit: eaf217108226633c03cc5c4c90f0b6e4587c8803
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2024-11-21T13:15:12+02:00
summary:

gh-126997: Fix support of non-ASCII strings in pickletools (GH-127062)

* Fix support of STRING and GLOBAL opcodes with non-ASCII arguments.
* dis() now outputs non-ASCII bytes in STRING, BINSTRING and
  SHORT_BINSTRING arguments as escaped (\xXX).

files:
A Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst
M Lib/pickletools.py
M Lib/test/test_pickletools.py

diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index c462d26da97ce1..d9c4fb1e63e91a 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -312,7 +312,7 @@ def read_uint8(f):
             doc="Eight-byte unsigned integer, little-endian.")
 
 
-def read_stringnl(f, decode=True, stripquotes=True):
+def read_stringnl(f, decode=True, stripquotes=True, *, encoding='latin-1'):
     r"""
     >>> import io
     >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
@@ -356,7 +356,7 @@ def read_stringnl(f, decode=True, stripquotes=True):
             raise ValueError("no string quotes around %r" % data)
 
     if decode:
-        data = codecs.escape_decode(data)[0].decode("ascii")
+        data = codecs.escape_decode(data)[0].decode(encoding)
     return data
 
 stringnl = ArgumentDescriptor(
@@ -370,7 +370,7 @@ def read_stringnl(f, decode=True, stripquotes=True):
                    """)
 
 def read_stringnl_noescape(f):
-    return read_stringnl(f, stripquotes=False)
+    return read_stringnl(f, stripquotes=False, encoding='utf-8')
 
 stringnl_noescape = ArgumentDescriptor(
                         name='stringnl_noescape',
@@ -2509,7 +2509,10 @@ def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
             # make a mild effort to align arguments
             line += ' ' * (10 - len(opcode.name))
             if arg is not None:
-                line += ' ' + repr(arg)
+                if opcode.name in ("STRING", "BINSTRING", "SHORT_BINSTRING"):
+                    line += ' ' + ascii(arg)
+                else:
+                    line += ' ' + repr(arg)
             if markmsg:
                 line += ' ' + markmsg
         if annotate:
diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py
index d8ff7a25cbc4b7..265dc497ccb86c 100644
--- a/Lib/test/test_pickletools.py
+++ b/Lib/test/test_pickletools.py
@@ -361,6 +361,88 @@ def test_annotate(self):
 highest protocol among opcodes = 0
 ''', annotate=20)
 
+    def test_string(self):
+        self.check_dis(b"S'abc'\n.", '''\
+    0: S    STRING     'abc'
+    7: .    STOP
+highest protocol among opcodes = 0
+''')
+        self.check_dis(b'S"abc"\n.', '''\
+    0: S    STRING     'abc'
+    7: .    STOP
+highest protocol among opcodes = 0
+''')
+        self.check_dis(b"S'\xc3\xb5'\n.", '''\
+    0: S    STRING     '\\xc3\\xb5'
+    6: .    STOP
+highest protocol among opcodes = 0
+''')
+
+    def test_string_without_quotes(self):
+        self.check_dis_error(b"Sabc'\n.", '',
+                             'no string quotes around b"abc\'"')
+        self.check_dis_error(b'Sabc"\n.', '',
+                             "no string quotes around b'abc\"'")
+        self.check_dis_error(b"S'abc\n.", '',
+                             '''strinq quote b"'" not found at both ends of b"'abc"''')
+        self.check_dis_error(b'S"abc\n.', '',
+                             r"""strinq quote b'"' not found at both ends of b'"abc'""")
+        self.check_dis_error(b"S'abc\"\n.", '',
+                             r"""strinq quote b"'" not found at both ends of b'\\'abc"'""")
+        self.check_dis_error(b"S\"abc'\n.", '',
+                             r"""strinq quote b'"' not found at both ends of b'"abc\\''""")
+
+    def test_binstring(self):
+        self.check_dis(b"T\x03\x00\x00\x00abc.", '''\
+    0: T    BINSTRING  'abc'
+    8: .    STOP
+highest protocol among opcodes = 1
+''')
+        self.check_dis(b"T\x02\x00\x00\x00\xc3\xb5.", '''\
+    0: T    BINSTRING  '\\xc3\\xb5'
+    7: .    STOP
+highest protocol among opcodes = 1
+''')
+
+    def test_short_binstring(self):
+        self.check_dis(b"U\x03abc.", '''\
+    0: U    SHORT_BINSTRING 'abc'
+    5: .    STOP
+highest protocol among opcodes = 1
+''')
+        self.check_dis(b"U\x02\xc3\xb5.", '''\
+    0: U    SHORT_BINSTRING '\\xc3\\xb5'
+    4: .    STOP
+highest protocol among opcodes = 1
+''')
+
+    def test_global(self):
+        self.check_dis(b"cmodule\nname\n.", '''\
+    0: c    GLOBAL     'module name'
+   13: .    STOP
+highest protocol among opcodes = 0
+''')
+        self.check_dis(b"cm\xc3\xb6dule\nn\xc3\xa4me\n.", '''\
+    0: c    GLOBAL     'm\xf6dule n\xe4me'
+   15: .    STOP
+highest protocol among opcodes = 0
+''')
+
+    def test_inst(self):
+        self.check_dis(b"(imodule\nname\n.", '''\
+    0: (    MARK
+    1: i        INST       'module name' (MARK at 0)
+   14: .    STOP
+highest protocol among opcodes = 0
+''')
+
+    def test_persid(self):
+        self.check_dis(b"Pabc\n.", '''\
+    0: P    PERSID     'abc'
+    5: .    STOP
+highest protocol among opcodes = 0
+''')
+
 
 class MiscTestCase(unittest.TestCase):
     def test__all__(self):
diff --git a/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst b/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst
new file mode 100644
index 00000000000000..b85c51ef07dcbe
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst
@@ -0,0 +1,3 @@
+Fix support of STRING and GLOBAL opcodes with non-ASCII arguments in
+:mod:`pickletools`. :func:`pickletools.dis` now outputs non-ASCII bytes in
+STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (``\xXX``).

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to