https://github.com/python/cpython/commit/efcac6f28145763aef96c3c6d3f1f8016849d165
commit: efcac6f28145763aef96c3c6d3f1f8016849d165
branch: main
author: Pablo Galindo Salgado <[email protected]>
committer: pablogsal <[email protected]>
date: 2026-05-02T15:07:28+01:00
summary:
gh-149214: Fix non ascii characters in remote debugging (#149228)
files:
A Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst
M Include/internal/pycore_debug_offsets.h
M Lib/test/test_external_inspection.py
M Modules/_remote_debugging/debug_offsets_validation.h
M Modules/_remote_debugging/object_reading.c
diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h
index c166f963da4f66..1dd10f8d94cfd8 100644
--- a/Include/internal/pycore_debug_offsets.h
+++ b/Include/internal/pycore_debug_offsets.h
@@ -215,6 +215,7 @@ typedef struct _Py_DebugOffsets {
uint64_t state;
uint64_t length;
uint64_t asciiobject_size;
+ uint64_t compactunicodeobject_size;
} unicode_object;
// GC runtime state offset;
@@ -370,6 +371,7 @@ typedef struct _Py_DebugOffsets {
.state = offsetof(PyUnicodeObject, _base._base.state), \
.length = offsetof(PyUnicodeObject, _base._base.length), \
.asciiobject_size = sizeof(PyASCIIObject), \
+ .compactunicodeobject_size = sizeof(PyCompactUnicodeObject), \
}, \
.gc = { \
.size = sizeof(struct _gc_runtime_state), \
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py
index ec7192b1b89184..401136de8de666 100644
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -559,6 +559,75 @@ def test_self_trace_after_ctypes_import(self):
f"stdout: {result.stdout}\nstderr: {result.stderr}"
)
+ @skip_if_not_supported
+ @unittest.skipIf(
+ sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
+ "Test only runs on Linux with process_vm_readv support",
+ )
+ def test_remote_stack_trace_non_ascii_names(self):
+ # Exercise each PyUnicode kind (1-byte non-ASCII, 2-byte BMP,
+ # 4-byte non-BMP) for both the filename and the function name
+ # reported in the stack trace.
+ latin1 = "zażółć" # 1-byte non-ASCII (forces non-ASCII path)
+ bmp = "λάμβδα" # 2-byte BMP
+ astral = "𐌀𐌁𐌂𐌃" # 4-byte non-BMP (Old Italic; XID, no NFKC fold)
+ func_name = f"{latin1}_{bmp}_{astral}"
+ script_basename = f"mod_{latin1}_{bmp}_{astral}"
+
+ port = find_unused_port()
+ script = textwrap.dedent(
+ f"""\
+ import socket
+ import time
+
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ sock.connect(('localhost', {port}))
+
+ def {func_name}():
+ sock.sendall(b"ready")
+ time.sleep(10_000)
+
+ {func_name}()
+ """
+ )
+ with os_helper.temp_dir() as work_dir:
+ script_dir = os.path.join(work_dir, "script_pkg")
+ os.mkdir(script_dir)
+ script_name = _make_test_script(script_dir, script_basename, script)
+
+ server_socket = _create_server_socket(port)
+ client_socket = None
+ try:
+ p = subprocess.Popen([sys.executable, script_name])
+ client_socket, _ = server_socket.accept()
+ server_socket.close()
+ _wait_for_signal(client_socket, b"ready")
+
+ stack_trace = get_stack_trace(p.pid)
+ except PermissionError:
+ self.skipTest("Insufficient permissions to read the stack trace")
+ finally:
+ if client_socket is not None:
+ client_socket.close()
+ p.kill()
+ p.wait(timeout=SHORT_TIMEOUT)
+
+ frames = [
+ frame
+ for interp in stack_trace
+ for thread in interp.threads
+ for frame in thread.frame_info
+ ]
+ target = next(
+ (f for f in frames if f.funcname == func_name), None
+ )
+ self.assertIsNotNone(
+ target,
+ f"Frame for {func_name!r} missing; got "
+ f"{[(f.filename, f.funcname) for f in frames]}",
+ )
+ self.assertEqual(target.filename, script_name)
+
@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
diff --git a/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst b/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst
new file mode 100644
index 00000000000000..cbb05620626d0f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst
@@ -0,0 +1,4 @@
+Fix :mod:`!_remote_debugging` misreading non-ASCII Unicode strings (Latin-1,
+BMP and non-BMP) from a remote process. Filenames and function names that
+contain non-ASCII characters are now reported correctly in stack traces, the
+sampling profiler, and :mod:`asyncio` task introspection.
diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h
index 32800e767b3169..53cd1adeb07d5b 100644
--- a/Modules/_remote_debugging/debug_offsets_validation.h
+++ b/Modules/_remote_debugging/debug_offsets_validation.h
@@ -31,7 +31,7 @@
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)
enum {
- PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 840,
+ PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 848,
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
};
@@ -304,7 +304,9 @@ validate_fixed_field(
#define PY_REMOTE_DEBUG_UNICODE_OBJECT_FIELDS(APPLY, buffer_size) \
APPLY(unicode_object, length, sizeof(Py_ssize_t), _Alignof(Py_ssize_t), buffer_size); \
- APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size)
+ APPLY(unicode_object, state, sizeof(struct _PyUnicodeObject_state), _Alignof(struct _PyUnicodeObject_state), buffer_size); \
+ APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size); \
+ APPLY(unicode_object, compactunicodeobject_size, sizeof(char), _Alignof(char), buffer_size)
#define PY_REMOTE_DEBUG_GEN_OBJECT_FIELDS(APPLY, buffer_size) \
APPLY(gen_object, gi_frame_state, sizeof(int8_t), _Alignof(int8_t), buffer_size); \
diff --git a/Modules/_remote_debugging/object_reading.c b/Modules/_remote_debugging/object_reading.c
index 59c28e223c545f..b63b103a2617ac 100644
--- a/Modules/_remote_debugging/object_reading.c
+++ b/Modules/_remote_debugging/object_reading.c
@@ -48,10 +48,8 @@ read_py_str(
uintptr_t address,
Py_ssize_t max_len
) {
- PyObject *result = NULL;
- char *buf = NULL;
-
- // Read the entire PyUnicodeObject at once
+ // Read the entire PyUnicodeObject at once; for short strings the data
+ // is inline right after the header and we'll already have (some of) it.
char unicode_obj[SIZEOF_UNICODE_OBJ];
int res = _Py_RemoteDebug_PagedReadRemoteMemory(
&unwinder->handle,
@@ -61,7 +59,7 @@ read_py_str(
);
if (res < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read PyUnicodeObject");
- goto err;
+ return NULL;
}
Py_ssize_t len = GET_MEMBER(Py_ssize_t, unicode_obj, unwinder->debug_offsets.unicode_object.length);
@@ -72,36 +70,94 @@ read_py_str(
return NULL;
}
- buf = (char *)PyMem_RawMalloc(len+1);
- if (buf == NULL) {
- PyErr_NoMemory();
- set_exception_cause(unwinder, PyExc_MemoryError, "Failed to allocate buffer for string reading");
+ // Inspect state to pick the right data offset and character width.
+ // We rely on the remote process sharing this Python version's
+ // PyASCIIObject layout, the same assumption already used for `length`.
+ struct _PyUnicodeObject_state state = GET_MEMBER(
+ struct _PyUnicodeObject_state,
+ unicode_obj,
+ unwinder->debug_offsets.unicode_object.state);
+
+ if (!state.compact) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Cannot read non-compact Unicode object at 0x%lx", address);
+ set_exception_cause(unwinder, PyExc_RuntimeError,
+ "Legacy (non-compact) Unicode objects are not supported");
return NULL;
}
- size_t offset = (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size;
- res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf);
- if (res < 0) {
- set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory");
- goto err;
+ int kind = (int)state.kind;
+ Py_UCS4 max_char;
+ switch (kind) {
+ case PyUnicode_1BYTE_KIND:
+ max_char = state.ascii ? 0x7F : 0xFF;
+ break;
+ case PyUnicode_2BYTE_KIND:
+ max_char = 0xFFFF;
+ break;
+ case PyUnicode_4BYTE_KIND:
+ max_char = 0x10FFFF;
+ break;
+ default:
+ PyErr_Format(PyExc_RuntimeError,
+ "Invalid Unicode kind %d at 0x%lx", kind, address);
+ set_exception_cause(unwinder, PyExc_RuntimeError,
+ "Invalid kind in remote Unicode object");
+ return NULL;
}
- buf[len] = '\0';
- result = PyUnicode_FromStringAndSize(buf, len);
+ size_t header_size = state.ascii
+ ? (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size
+ : (size_t)unwinder->debug_offsets.unicode_object.compactunicodeobject_size;
+
+ // len * kind is bounded by max_len * 4 (kind <= 4, len <= max_len), so
+ // the multiplication can't overflow for any caller-sane max_len, but the
+ // explicit cap here keeps a corrupted remote `length` from later turning
+ // into a giant allocation.
+ size_t nbytes = (size_t)len * (size_t)kind;
+ if ((size_t)len > (SIZE_MAX / 4) || nbytes > (size_t)max_len * 4) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Implausible Unicode byte size %zu at 0x%lx", nbytes, address);
+ set_exception_cause(unwinder, PyExc_RuntimeError,
+ "Garbage byte size in remote Unicode object");
+ return NULL;
+ }
+
+ PyObject *result = PyUnicode_New(len, max_char);
if (result == NULL) {
- set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create PyUnicode from remote string data");
- goto err;
+ set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to allocate PyUnicode for remote string");
+ return NULL;
+ }
+ if (nbytes == 0) {
+ return result;
}
- PyMem_RawFree(buf);
- assert(result != NULL);
- return result;
+ void *data = PyUnicode_DATA(result);
-err:
- if (buf != NULL) {
- PyMem_RawFree(buf);
+ // Reuse data already present in the header read; only round-trip for
+ // whatever spills past it.
+ size_t inline_avail = (header_size < SIZEOF_UNICODE_OBJ)
+ ? SIZEOF_UNICODE_OBJ - header_size
+ : 0;
+ size_t inline_bytes = nbytes < inline_avail ? nbytes : inline_avail;
+ if (inline_bytes > 0) {
+ memcpy(data, unicode_obj + header_size, inline_bytes);
}
- return NULL;
+
+ if (nbytes > inline_bytes) {
+ res = _Py_RemoteDebug_PagedReadRemoteMemory(
+ &unwinder->handle,
+ address + header_size + inline_bytes,
+ nbytes - inline_bytes,
+ (char *)data + inline_bytes);
+ if (res < 0) {
+ Py_DECREF(result);
+ set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory");
+ return NULL;
+ }
+ }
+
+ return result;
}
PyObject *
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]