https://github.com/python/cpython/commit/efcac6f28145763aef96c3c6d3f1f8016849d165
commit: efcac6f28145763aef96c3c6d3f1f8016849d165
branch: main
author: Pablo Galindo Salgado <[email protected]>
committer: pablogsal <[email protected]>
date: 2026-05-02T15:07:28+01:00
summary:

gh-149214: Fix non ascii characters in remote debugging (#149228)

files:
A Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst
M Include/internal/pycore_debug_offsets.h
M Lib/test/test_external_inspection.py
M Modules/_remote_debugging/debug_offsets_validation.h
M Modules/_remote_debugging/object_reading.c

diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h
index c166f963da4f66..1dd10f8d94cfd8 100644
--- a/Include/internal/pycore_debug_offsets.h
+++ b/Include/internal/pycore_debug_offsets.h
@@ -215,6 +215,7 @@ typedef struct _Py_DebugOffsets {
         uint64_t state;
         uint64_t length;
         uint64_t asciiobject_size;
+        uint64_t compactunicodeobject_size;
     } unicode_object;
 
     // GC runtime state offset;
@@ -370,6 +371,7 @@ typedef struct _Py_DebugOffsets {
         .state = offsetof(PyUnicodeObject, _base._base.state), \
         .length = offsetof(PyUnicodeObject, _base._base.length), \
         .asciiobject_size = sizeof(PyASCIIObject), \
+        .compactunicodeobject_size = sizeof(PyCompactUnicodeObject), \
     }, \
     .gc = { \
         .size = sizeof(struct _gc_runtime_state), \
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py
index ec7192b1b89184..401136de8de666 100644
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -559,6 +559,75 @@ def test_self_trace_after_ctypes_import(self):
             f"stdout: {result.stdout}\nstderr: {result.stderr}"
         )
 
+    @skip_if_not_supported
+    @unittest.skipIf(
+        sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+        "Test only runs on Linux with process_vm_readv support",
+    )
+    def test_remote_stack_trace_non_ascii_names(self):
+        # Exercise each PyUnicode kind (1-byte non-ASCII, 2-byte BMP,
+        # 4-byte non-BMP) for both the filename and the function name
+        # reported in the stack trace.
+        latin1 = "zażółć"          # 1-byte non-ASCII (forces non-ASCII path)
+        bmp = "λάμβδα"             # 2-byte BMP
+        astral = "𐌀𐌁𐌂𐌃"            # 4-byte non-BMP (Old Italic; XID, no NFKC fold)
+        func_name = f"{latin1}_{bmp}_{astral}"
+        script_basename = f"mod_{latin1}_{bmp}_{astral}"
+
+        port = find_unused_port()
+        script = textwrap.dedent(
+            f"""\
+            import socket
+            import time
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect(('localhost', {port}))
+
+            def {func_name}():
+                sock.sendall(b"ready")
+                time.sleep(10_000)
+
+            {func_name}()
+            """
+        )
+        with os_helper.temp_dir() as work_dir:
+            script_dir = os.path.join(work_dir, "script_pkg")
+            os.mkdir(script_dir)
+            script_name = _make_test_script(script_dir, script_basename, script)
+
+            server_socket = _create_server_socket(port)
+            client_socket = None
+            try:
+                p = subprocess.Popen([sys.executable, script_name])
+                client_socket, _ = server_socket.accept()
+                server_socket.close()
+                _wait_for_signal(client_socket, b"ready")
+
+                stack_trace = get_stack_trace(p.pid)
+            except PermissionError:
+                self.skipTest("Insufficient permissions to read the stack trace")
+            finally:
+                if client_socket is not None:
+                    client_socket.close()
+                p.kill()
+                p.wait(timeout=SHORT_TIMEOUT)
+
+            frames = [
+                frame
+                for interp in stack_trace
+                for thread in interp.threads
+                for frame in thread.frame_info
+            ]
+            target = next(
+                (f for f in frames if f.funcname == func_name), None
+            )
+            self.assertIsNotNone(
+                target,
+                f"Frame for {func_name!r} missing; got "
+                f"{[(f.filename, f.funcname) for f in frames]}",
+            )
+            self.assertEqual(target.filename, script_name)
+
     @skip_if_not_supported
     @unittest.skipIf(
         sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
diff --git a/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst b/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst
new file mode 100644
index 00000000000000..cbb05620626d0f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst
@@ -0,0 +1,4 @@
+Fix :mod:`!_remote_debugging` misreading non-ASCII Unicode strings (Latin-1,
+BMP and non-BMP) from a remote process. Filenames and function names that
+contain non-ASCII characters are now reported correctly in stack traces, the
+sampling profiler, and :mod:`asyncio` task introspection.
diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h
index 32800e767b3169..53cd1adeb07d5b 100644
--- a/Modules/_remote_debugging/debug_offsets_validation.h
+++ b/Modules/_remote_debugging/debug_offsets_validation.h
@@ -31,7 +31,7 @@
 #define FIELD_SIZE(type, member) sizeof(((type *)0)->member)
 
 enum {
-    PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 840,
+    PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 848,
     PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
 };
 
@@ -304,7 +304,9 @@ validate_fixed_field(
 
 #define PY_REMOTE_DEBUG_UNICODE_OBJECT_FIELDS(APPLY, buffer_size) \
     APPLY(unicode_object, length, sizeof(Py_ssize_t), _Alignof(Py_ssize_t), buffer_size); \
-    APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size)
+    APPLY(unicode_object, state, sizeof(struct _PyUnicodeObject_state), _Alignof(struct _PyUnicodeObject_state), buffer_size); \
+    APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size); \
+    APPLY(unicode_object, compactunicodeobject_size, sizeof(char), _Alignof(char), buffer_size)
 
 #define PY_REMOTE_DEBUG_GEN_OBJECT_FIELDS(APPLY, buffer_size) \
     APPLY(gen_object, gi_frame_state, sizeof(int8_t), _Alignof(int8_t), buffer_size); \
diff --git a/Modules/_remote_debugging/object_reading.c b/Modules/_remote_debugging/object_reading.c
index 59c28e223c545f..b63b103a2617ac 100644
--- a/Modules/_remote_debugging/object_reading.c
+++ b/Modules/_remote_debugging/object_reading.c
@@ -48,10 +48,8 @@ read_py_str(
     uintptr_t address,
     Py_ssize_t max_len
 ) {
-    PyObject *result = NULL;
-    char *buf = NULL;
-
-    // Read the entire PyUnicodeObject at once
+    // Read the entire PyUnicodeObject at once; for short strings the data
+    // is inline right after the header and we'll already have (some of) it.
     char unicode_obj[SIZEOF_UNICODE_OBJ];
     int res = _Py_RemoteDebug_PagedReadRemoteMemory(
         &unwinder->handle,
@@ -61,7 +59,7 @@ read_py_str(
     );
     if (res < 0) {
         set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read PyUnicodeObject");
-        goto err;
+        return NULL;
     }
 
     Py_ssize_t len = GET_MEMBER(Py_ssize_t, unicode_obj, unwinder->debug_offsets.unicode_object.length);
@@ -72,36 +70,94 @@ read_py_str(
         return NULL;
     }
 
-    buf = (char *)PyMem_RawMalloc(len+1);
-    if (buf == NULL) {
-        PyErr_NoMemory();
-        set_exception_cause(unwinder, PyExc_MemoryError, "Failed to allocate buffer for string reading");
+    // Inspect state to pick the right data offset and character width.
+    // We rely on the remote process sharing this Python version's
+    // PyASCIIObject layout, the same assumption already used for `length`.
+    struct _PyUnicodeObject_state state = GET_MEMBER(
+        struct _PyUnicodeObject_state,
+        unicode_obj,
+        unwinder->debug_offsets.unicode_object.state);
+
+    if (!state.compact) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "Cannot read non-compact Unicode object at 0x%lx", address);
+        set_exception_cause(unwinder, PyExc_RuntimeError,
+                            "Legacy (non-compact) Unicode objects are not supported");
         return NULL;
     }
 
-    size_t offset = (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size;
-    res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf);
-    if (res < 0) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory");
-        goto err;
+    int kind = (int)state.kind;
+    Py_UCS4 max_char;
+    switch (kind) {
+        case PyUnicode_1BYTE_KIND:
+            max_char = state.ascii ? 0x7F : 0xFF;
+            break;
+        case PyUnicode_2BYTE_KIND:
+            max_char = 0xFFFF;
+            break;
+        case PyUnicode_4BYTE_KIND:
+            max_char = 0x10FFFF;
+            break;
+        default:
+            PyErr_Format(PyExc_RuntimeError,
+                         "Invalid Unicode kind %d at 0x%lx", kind, address);
+            set_exception_cause(unwinder, PyExc_RuntimeError,
+                                "Invalid kind in remote Unicode object");
+            return NULL;
     }
-    buf[len] = '\0';
 
-    result = PyUnicode_FromStringAndSize(buf, len);
+    size_t header_size = state.ascii
+        ? (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size
+        : (size_t)unwinder->debug_offsets.unicode_object.compactunicodeobject_size;
+
+    // len * kind is bounded by max_len * 4 (kind <= 4, len <= max_len), so
+    // the multiplication can't overflow for any caller-sane max_len, but the
+    // explicit cap here keeps a corrupted remote `length` from later turning
+    // into a giant allocation.
+    size_t nbytes = (size_t)len * (size_t)kind;
+    if ((size_t)len > (SIZE_MAX / 4) || nbytes > (size_t)max_len * 4) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "Implausible Unicode byte size %zu at 0x%lx", nbytes, address);
+        set_exception_cause(unwinder, PyExc_RuntimeError,
+                            "Garbage byte size in remote Unicode object");
+        return NULL;
+    }
+
+    PyObject *result = PyUnicode_New(len, max_char);
     if (result == NULL) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create PyUnicode from remote string data");
-        goto err;
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to allocate PyUnicode for remote string");
+        return NULL;
+    }
+    if (nbytes == 0) {
+        return result;
     }
 
-    PyMem_RawFree(buf);
-    assert(result != NULL);
-    return result;
+    void *data = PyUnicode_DATA(result);
 
-err:
-    if (buf != NULL) {
-        PyMem_RawFree(buf);
+    // Reuse data already present in the header read; only round-trip for
+    // whatever spills past it.
+    size_t inline_avail = (header_size < SIZEOF_UNICODE_OBJ)
+        ? SIZEOF_UNICODE_OBJ - header_size
+        : 0;
+    size_t inline_bytes = nbytes < inline_avail ? nbytes : inline_avail;
+    if (inline_bytes > 0) {
+        memcpy(data, unicode_obj + header_size, inline_bytes);
     }
-    return NULL;
+
+    if (nbytes > inline_bytes) {
+        res = _Py_RemoteDebug_PagedReadRemoteMemory(
+            &unwinder->handle,
+            address + header_size + inline_bytes,
+            nbytes - inline_bytes,
+            (char *)data + inline_bytes);
+        if (res < 0) {
+            Py_DECREF(result);
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory");
+            return NULL;
+        }
+    }
+
+    return result;
 }
 
 PyObject *

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

Reply via email to