https://github.com/python/cpython/commit/034c536d56aa89350dcdd29bf14bc54042abca04
commit: 034c536d56aa89350dcdd29bf14bc54042abca04
branch: 3.15
author: Miss Islington (bot) <[email protected]>
committer: pablogsal <[email protected]>
date: 2026-05-20T11:59:10Z
summary:

[3.15] gh-149584: Fix excessive overhead in the Tachyon profiler regarding the cache behavior (GH-149649) (#150152)

files:
A Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst
M Lib/profiling/sampling/sample.py
M Lib/test/test_external_inspection.py
M Modules/_remote_debugging/_remote_debugging.h
M Modules/_remote_debugging/clinic/module.c.h
M Modules/_remote_debugging/code_objects.c
M Modules/_remote_debugging/frame_cache.c
M Modules/_remote_debugging/frames.c
M Modules/_remote_debugging/module.c
M Modules/_remote_debugging/threads.c
M Python/remote_debug.h
M Tools/inspection/benchmark_external_inspection.py

diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py
index b9e7e2625d09e4..2d379e1e16a35e 100644
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@@ -327,6 +327,33 @@ def _print_unwinder_stats(self):
         print(f"    Hits:             {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
         print(f"    Misses:           {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")
 
+        batched_attempts = stats.get('batched_read_attempts', 0)
+        batched_successes = stats.get('batched_read_successes', 0)
+        batched_misses = stats.get('batched_read_misses', 0)
+        segments_requested = stats.get('batched_read_segments_requested', 0)
+        segments_completed = stats.get('batched_read_segments_completed', 0)
+        if batched_attempts > 0:
+            batched_success_rate = stats.get('batched_read_success_rate', 0.0)
+            batched_miss_rate = 100.0 - batched_success_rate
+            segment_completion_rate = stats.get(
+                'batched_read_segment_completion_rate', 0.0
+            )
+
+            print(f"  {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
+            print(f"    Attempts:         {batched_attempts:n}")
+            print(
+                f"    Successes:        {batched_successes:n} "
+                f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
+            )
+            print(
+                f"    Misses:           {batched_misses:n} "
+                f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
+            )
+            print(
+                f"    Segments read:    {segments_completed:n}/{segments_requested:n} "
+                f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
+            )
+
         # Memory operations
         memory_reads = stats.get('memory_reads', 0)
         memory_bytes = stats.get('memory_bytes_read', 0)
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py
index a29e6cdbbf6c78..6b1529aa173f01 100644
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -3767,6 +3767,13 @@ def test_get_stats(self):
             "frames_read_from_cache",
             "frames_read_from_memory",
             "frame_cache_hit_rate",
+            "batched_read_attempts",
+            "batched_read_successes",
+            "batched_read_misses",
+            "batched_read_segments_requested",
+            "batched_read_segments_completed",
+            "batched_read_success_rate",
+            "batched_read_segment_completion_rate",
         ]
         for key in expected_keys:
             self.assertIn(key, stats)
diff --git a/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst
new file mode 100644
index 00000000000000..6734250fdd6af3
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst
@@ -0,0 +1,4 @@
+Fix excessive overhead in the Tachyon profiler when inspecting a remote
+process by avoiding repeated remote page-cache scans, batching predicted
+remote reads, and reusing cached profiler result objects. Patch by Pablo
+Galindo and Maurycy Pawłowski-Wieroński.
diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h
index 7369cd1514c296..d91ce54a18c813 100644
--- a/Modules/_remote_debugging/_remote_debugging.h
+++ b/Modules/_remote_debugging/_remote_debugging.h
@@ -30,6 +30,7 @@ extern "C" {
 #include "internal/pycore_llist.h"          // struct llist_node
 #include "internal/pycore_long.h"           // _PyLong_GetZero
 #include "internal/pycore_pyerrors.h"       // _PyErr_FormatFromCause
+#include "internal/pycore_pyhash.h"        // _Py_HashPointerRaw
 #include "internal/pycore_stackref.h"       // Py_TAG_BITS
 #include "../../Python/remote_debug.h"
 
@@ -215,6 +216,8 @@ typedef struct {
     PyObject *file_name;
     int first_lineno;
     PyObject *linetable;  // bytes
+    PyObject *last_frame_info;
+    ptrdiff_t last_addrq;
     uintptr_t addr_code_adaptive;
 } CachedCodeMetadata;
 
@@ -224,11 +227,41 @@ typedef struct {
 
 typedef struct {
     uint64_t thread_id;                      // 0 = empty slot
+    uintptr_t thread_state_addr;
     uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
     Py_ssize_t num_addrs;
+    PyObject *thread_id_obj;                 // owned reference, NULL if empty
     PyObject *frame_list;                    // owned reference, NULL if empty
 } FrameCacheEntry;
 
+#define INTERPRETER_THREAD_CACHE_SIZE 32
+#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0
+#  error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
+#endif
+
+// The two per-interpreter L2 caches below are split into per-field tables so
+// that a writer rebinding one slot cannot leave stale data in a field owned by
+// the other when the slot is reused across interpreters.
+typedef struct {
+    uintptr_t interpreter_addr;
+    uintptr_t thread_state_addr;
+} InterpreterTstateCacheEntry;
+typedef struct {
+    uintptr_t interpreter_addr;
+    uint64_t code_object_generation;
+} InterpreterGenerationCacheEntry;
+
+// Carries already-read thread state and/or frame buffers across helpers so the
+// downstream callee can skip a remote read. Address fields are caller-supplied
+// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
+// successfully populated them.
+typedef struct {
+    const char *tstate;
+    uintptr_t tstate_addr;
+    const char *frame;
+    uintptr_t frame_addr;
+} RemoteReadPrefetch;
+
 /* Statistics for profiling performance analysis */
 typedef struct {
     uint64_t total_samples;                  // Total number of get_stack_trace calls
@@ -242,14 +275,44 @@ typedef struct {
     uint64_t code_object_cache_hits;         // Code object cache hits
     uint64_t code_object_cache_misses;       // Code object cache misses
     uint64_t stale_cache_invalidations;      // Times stale entries were cleared
+    uint64_t batched_read_attempts;          // Batched remote-read attempts
+    uint64_t batched_read_successes;         // Attempts that read all requested segments
+    uint64_t batched_read_misses;            // Attempts that fell back or partially read
+    uint64_t batched_read_segments_requested; // Segments requested by batched reads
+    uint64_t batched_read_segments_completed; // Segments completed by batched reads
 } UnwinderStats;
 
+#if defined(__GNUC__) || defined(__clang__)
+#  define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)
+#else
+#  define REMOTE_DEBUG_UNLIKELY(value) (value)
+#endif
+
 /* Stats tracking macros - no-op when stats collection is disabled */
 #define STATS_INC(unwinder, field) \
-    do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
+    do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)
 
 #define STATS_ADD(unwinder, field, val) \
-    do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
+    do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)
+
+#if HAVE_PROCESS_VM_READV
+#  define STATS_BATCHED_READ(unwinder, requested, completed) \
+    do { \
+        if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
+            (unwinder)->stats.batched_read_attempts++; \
+            (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
+            (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
+            if ((completed) == (requested)) { \
+                (unwinder)->stats.batched_read_successes++; \
+            } \
+            else { \
+                (unwinder)->stats.batched_read_misses++; \
+            } \
+        } \
+    } while(0)
+#else
+#  define STATS_BATCHED_READ(unwinder, requested, completed) ((void)0)
+#endif
 
 typedef struct {
     PyTypeObject *RemoteDebugging_Type;
@@ -290,7 +353,6 @@ typedef struct {
     struct _Py_AsyncioModuleDebugOffsets async_debug_offsets;
     uintptr_t interpreter_addr;
     uintptr_t tstate_addr;
-    uint64_t code_object_generation;
     _Py_hashtable_t *code_object_cache;
     int debug;
     int only_active_thread;
@@ -302,9 +364,17 @@ typedef struct {
     int cache_frames;
     int collect_stats;  // whether to collect statistics
     uint32_t stale_invalidation_counter;  // counter for throttling frame_cache_invalidate_stale
+    // L1 single-entry shortcut over cached_tstates[]: most workloads sample one
+    // interpreter, so check these pairs before hashing into the table below.
+    uintptr_t cached_tstate_interpreter_addr;
+    uintptr_t cached_tstate_addr;
+    uintptr_t cached_generation_interpreter_addr;
+    uint64_t cached_code_object_generation;
     RemoteDebuggingState *cached_state;
     FrameCacheEntry *frame_cache;  // preallocated array of FRAME_CACHE_MAX_THREADS entries
     UnwinderStats stats;  // statistics for performance analysis
+    InterpreterTstateCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
+    InterpreterGenerationCacheEntry cached_generations[INTERPRETER_THREAD_CACHE_SIZE];
 #ifdef Py_GIL_DISABLED
     uint32_t tlbc_generation;
     _Py_hashtable_t *tlbc_cache;
@@ -361,11 +431,13 @@ typedef struct {
 typedef struct {
     /* Inputs */
     uintptr_t frame_addr;           // Starting frame address
+    uintptr_t thread_state_addr;    // Owning thread state address
     uintptr_t base_frame_addr;      // Sentinel at bottom (for validation)
     uintptr_t gc_frame;             // GC frame address (0 if not tracking)
     uintptr_t last_profiled_frame;  // Last cached frame (0 if no cache)
     StackChunkList *chunks;         // Pre-copied stack chunks
     int skip_first_frame;           // Skip frame_addr itself (continue from its caller)
+    RemoteReadPrefetch prefetch;     // Optional already-read thread/frame buffers
 
     /* Outputs */
     PyObject *frame_info;           // List to append FrameInfo objects
@@ -548,6 +620,7 @@ extern int process_frame_chain(
 extern int frame_cache_init(RemoteUnwinderObject *unwinder);
 extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
 extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
+extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
 extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
 extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
 extern int frame_cache_lookup_and_extend(
@@ -566,6 +639,7 @@ extern int frame_cache_store(
     PyObject *frame_list,
     const uintptr_t *addrs,
     Py_ssize_t num_addrs,
+    uintptr_t thread_state_addr,
     uintptr_t base_frame_addr,
     uintptr_t last_frame_visited);
 
@@ -605,7 +679,8 @@ extern PyObject* unwind_stack_for_thread(
     uintptr_t *current_tstate,
     uintptr_t gil_holder_tstate,
     uintptr_t gc_frame,
-    uintptr_t main_thread_tstate
+    uintptr_t main_thread_tstate,
+    const RemoteReadPrefetch *prefetch
 );
 
 /* Thread stopping functions (for blocking mode) */
diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h
index d56622fb82ab56..78b1d3e8d80962 100644
--- a/Modules/_remote_debugging/clinic/module.c.h
+++ b/Modules/_remote_debugging/clinic/module.c.h
@@ -411,8 +411,15 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__,
 "        - code_object_cache_hits: Code object cache hits\n"
 "        - code_object_cache_misses: Code object cache misses\n"
 "        - stale_cache_invalidations: Times stale cache entries were cleared\n"
+"        - batched_read_attempts: Batched remote-read attempts\n"
+"        - batched_read_successes: Attempts that read all requested segments\n"
+"        - batched_read_misses: Attempts that fell back or partially read\n"
+"        - batched_read_segments_requested: Segments requested by batched reads\n"
+"        - batched_read_segments_completed: Segments completed by batched reads\n"
 "        - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
 "        - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
+"        - batched_read_success_rate: Percentage of batched reads that completed all segments\n"
+"        - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n"
 "\n"
 "Raises:\n"
 "    RuntimeError: If stats collection was not enabled (stats=False)");
@@ -1540,4 +1547,4 @@ _remote_debugging_get_gc_stats(PyObject *module, PyObject *const *args, Py_ssize
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/
diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c
index 97c6ba772e88f1..3af58f2b3c379e 100644
--- a/Modules/_remote_debugging/code_objects.c
+++ b/Modules/_remote_debugging/code_objects.c
@@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
         meta->func_name = func;
         meta->file_name = file;
         meta->linetable = linetable;
+        meta->last_frame_info = NULL;
+        meta->last_addrq = -1;
         meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
         meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;
 
@@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder,
     addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
 #endif
     ;  // Empty statement to avoid C23 extension warning
+
+    if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
+        *result = Py_NewRef(meta->last_frame_info);
+        return 0;
+    }
+
     LocationInfo info = {0};
     bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
                               PyBytes_GET_SIZE(meta->linetable),
@@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
         goto error;
     }
 
+    if (!unwinder->opcodes) {
+        Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
+        meta->last_addrq = addrq;
+    }
+
     *result = tuple;
     return 0;
 
diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c
index b6566d7cff7b54..19fc406bca9ac9 100644
--- a/Modules/_remote_debugging/frame_cache.c
+++ b/Modules/_remote_debugging/frame_cache.c
@@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
         return;
     }
     for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
+        Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
         Py_CLEAR(unwinder->frame_cache[i].frame_list);
     }
     PyMem_Free(unwinder->frame_cache);
@@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
     return NULL;
 }
 
+FrameCacheEntry *
+frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
+{
+    if (!unwinder->frame_cache || tstate_addr == 0) {
+        return NULL;
+    }
+    for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
+        if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
+            assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
+            return &unwinder->frame_cache[i];
+        }
+    }
+    return NULL;
+}
+
 // Allocate a cache slot for a thread
 // Returns NULL if cache is full (graceful degradation)
 static FrameCacheEntry *
@@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
         }
         if (!found) {
             // Clear this entry
+            Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
             Py_CLEAR(unwinder->frame_cache[i].frame_list);
             unwinder->frame_cache[i].thread_id = 0;
+            unwinder->frame_cache[i].thread_state_addr = 0;
             unwinder->frame_cache[i].num_addrs = 0;
             STATS_INC(unwinder, stale_cache_invalidations);
         }
@@ -216,6 +234,7 @@ frame_cache_store(
     PyObject *frame_list,
     const uintptr_t *addrs,
     Py_ssize_t num_addrs,
+    uintptr_t thread_state_addr,
     uintptr_t base_frame_addr,
     uintptr_t last_frame_visited)
 {
@@ -257,6 +276,13 @@ frame_cache_store(
         return -1;
     }
     entry->thread_id = thread_id;
+    entry->thread_state_addr = thread_state_addr;
+    if (entry->thread_id_obj == NULL) {
+        entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
+        if (entry->thread_id_obj == NULL) {
+            return -1;
+        }
+    }
     memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
     entry->num_addrs = num_addrs;
     assert(entry->num_addrs == num_addrs);
diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c
index bbdfce3f7201d9..8d8019396b3e31 100644
--- a/Modules/_remote_debugging/frames.c
+++ b/Modules/_remote_debugging/frames.c
@@ -186,30 +186,16 @@ is_frame_valid(
     return 1;
 }
 
-int
-parse_frame_object(
+static int
+parse_frame_buffer(
     RemoteUnwinderObject *unwinder,
     PyObject** result,
-    uintptr_t address,
+    const char *frame,
     uintptr_t* address_of_code_object,
     uintptr_t* previous_frame
 ) {
-    char frame[SIZEOF_INTERP_FRAME];
     *address_of_code_object = 0;
 
-    Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
-        &unwinder->handle,
-        address,
-        SIZEOF_INTERP_FRAME,
-        frame
-    );
-    if (bytes_read < 0) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
-        return -1;
-    }
-    STATS_INC(unwinder, memory_reads);
-    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
-
     *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
     uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
     int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
@@ -237,6 +223,31 @@ parse_frame_object(
     return parse_code_object(unwinder, result, &code_ctx);
 }
 
+int
+parse_frame_object(
+    RemoteUnwinderObject *unwinder,
+    PyObject** result,
+    uintptr_t address,
+    uintptr_t* address_of_code_object,
+    uintptr_t* previous_frame
+) {
+    char frame[SIZEOF_INTERP_FRAME];
+    Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
+        &unwinder->handle,
+        address,
+        SIZEOF_INTERP_FRAME,
+        frame
+    );
+    if (bytes_read < 0) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
+        return -1;
+    }
+    STATS_INC(unwinder, memory_reads);
+    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
+
+    return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame);
+}
+
 int
 parse_frame_from_chunks(
     RemoteUnwinderObject *unwinder,
@@ -312,15 +323,32 @@ process_frame_chain(
         }
         assert(frame_count <= MAX_FRAMES);
 
-        if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
+        if (ctx->chunks && ctx->chunks->count > 0) {
+            if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
+                goto parsed_frame;
+            }
             PyErr_Clear();
+        }
+        {
             uintptr_t address_of_code_object = 0;
-            if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
+            int parse_result;
+            if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) {
+                parse_result = parse_frame_buffer(
+                    unwinder, &frame, ctx->prefetch.frame,
+                    &address_of_code_object, &next_frame_addr);
+            }
+            else {
+                parse_result = parse_frame_object(
+                    unwinder, &frame, frame_addr,
+                    &address_of_code_object, &next_frame_addr);
+            }
+            if (parse_result < 0) {
                 set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
                 return -1;
             }
         }
 
+parsed_frame:
         // Skip first frame if requested (used for cache miss continuation)
         if (ctx->skip_first_frame && frame_count == 1) {
             Py_XDECREF(frame);
@@ -501,41 +529,37 @@ try_full_cache_hit(
     PyObject *current_frame = NULL;
     uintptr_t code_object_addr = 0;
     uintptr_t previous_frame = 0;
-    int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
+    int parse_result;
+    if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) {
+        parse_result = parse_frame_buffer(unwinder, &current_frame,
+                                          ctx->prefetch.frame,
                                           &code_object_addr, &previous_frame);
+    }
+    else {
+        parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
+                                          &code_object_addr, &previous_frame);
+    }
     if (parse_result < 0) {
         return -1;
     }
 
-    Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
-    PyObject *parent_slice = NULL;
-    if (cached_size > 1) {
-        parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size);
-        if (!parent_slice) {
-            Py_XDECREF(current_frame);
-            return -1;
-        }
-    }
-
     if (current_frame != NULL) {
         if (PyList_Append(ctx->frame_info, current_frame) < 0) {
             Py_DECREF(current_frame);
-            Py_XDECREF(parent_slice);
             return -1;
         }
         Py_DECREF(current_frame);
         STATS_ADD(unwinder, frames_read_from_memory, 1);
     }
 
-    if (parent_slice) {
-        Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info);
-        int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice);
-        Py_DECREF(parent_slice);
-        if (result < 0) {
+    Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
+    for (Py_ssize_t i = 1; i < cached_size; i++) {
+        PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i);
+        if (PyList_Append(ctx->frame_info, cached_frame) < 0) {
             return -1;
         }
-        STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1);
     }
+    STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0);
 
     STATS_INC(unwinder, frame_cache_hits);
     return 1;
@@ -606,7 +630,8 @@ collect_frames_with_cache(
     }
 
     if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
-                          ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
+                          ctx->thread_state_addr, ctx->base_frame_addr,
+                          ctx->last_frame_visited) < 0) {
         return -1;
     }
 
diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c
index efdd2e1a2d7b7a..ae2f7e7f31ba77 100644
--- a/Modules/_remote_debugging/module.c
+++ b/Modules/_remote_debugging/module.c
@@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr)
     Py_DECREF(meta->func_name);
     Py_DECREF(meta->file_name);
     Py_DECREF(meta->linetable);
+    Py_XDECREF(meta->last_frame_info);
     PyMem_RawFree(meta);
 }
 
@@ -360,6 +361,10 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
     self->cache_frames = cache_frames;
     self->collect_stats = stats;
     self->stale_invalidation_counter = 0;
+    self->cached_tstate_interpreter_addr = 0;
+    self->cached_tstate_addr = 0;
+    memset(self->cached_tstates, 0, sizeof(self->cached_tstates));
+    memset(self->cached_generations, 0, sizeof(self->cached_generations));
     self->debug = debug;
     self->only_active_thread = only_active_thread;
     self->mode = mode;
@@ -473,6 +478,172 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
     return 0;
 }
 
+static inline size_t
+interpreter_thread_cache_index(uintptr_t interpreter_addr)
+{
+    // Direct-mapped table indexed by the remote interpreter address. Each entry
+    // stores the full address and verifies it on lookup, so hash collisions
+    // degrade to misses and cannot return a value from the wrong interpreter.
+    return (size_t)_Py_HashPointerRaw((const void *)interpreter_addr)
+        & (INTERPRETER_THREAD_CACHE_SIZE - 1);
+}
+
+static inline uintptr_t
+get_cached_tstate_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr)
+{
+    if (interpreter_addr == 0) {
+        return 0;
+    }
+
+    if (self->cached_tstate_interpreter_addr == interpreter_addr) {
+        return self->cached_tstate_addr;
+    }
+
+    InterpreterTstateCacheEntry *entry =
+        &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)];
+    if (entry->interpreter_addr == interpreter_addr) {
+        self->cached_tstate_interpreter_addr = interpreter_addr;
+        self->cached_tstate_addr = entry->thread_state_addr;
+        return entry->thread_state_addr;
+    }
+    return 0;
+}
+
+static inline void
+set_cached_tstate_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr,
+    uintptr_t thread_state_addr)
+{
+    if (interpreter_addr == 0 || thread_state_addr == 0) {
+        return;
+    }
+
+    self->cached_tstate_interpreter_addr = interpreter_addr;
+    self->cached_tstate_addr = thread_state_addr;
+
+    InterpreterTstateCacheEntry *entry =
+        &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)];
+    entry->interpreter_addr = interpreter_addr;
+    entry->thread_state_addr = thread_state_addr;
+}
+
+static void
+refresh_generation_caches_from_interp_state(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr,
+    const char *interp_state_buffer)
+{
+    uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
+            self->debug_offsets.interpreter_state.code_object_generation);
+
+    if (self->cached_generation_interpreter_addr == interpreter_addr) {
+        if (code_object_generation != self->cached_code_object_generation) {
+            self->cached_code_object_generation = code_object_generation;
+            _Py_hashtable_clear(self->code_object_cache);
+        }
+    }
+    else {
+        InterpreterGenerationCacheEntry *entry =
+            &self->cached_generations[interpreter_thread_cache_index(interpreter_addr)];
+        // A slot rebound from another interpreter must be treated as changed:
+        // the code_object_cache is global, so even if the new generation
+        // numerically matches what the previous occupant had, stale entries
+        // from that occupant could still be served.
+        int changed = entry->interpreter_addr != interpreter_addr
+                   || entry->code_object_generation != code_object_generation;
+        entry->interpreter_addr = interpreter_addr;
+        entry->code_object_generation = code_object_generation;
+        if (changed) {
+            _Py_hashtable_clear(self->code_object_cache);
+        }
+        self->cached_generation_interpreter_addr = interpreter_addr;
+        self->cached_code_object_generation = code_object_generation;
+    }
+
+#ifdef Py_GIL_DISABLED
+    uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
+                                                  self->debug_offsets.interpreter_state.tlbc_generation);
+    if (current_tlbc_generation != self->tlbc_generation) {
+        self->tlbc_generation = current_tlbc_generation;
+        _Py_hashtable_clear(self->tlbc_cache);
+    }
+#endif
+}
+
+static int
+refresh_generation_caches_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr)
+{
+    char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
+    if (_Py_RemoteDebug_ReadRemoteMemory(
+            &self->handle,
+            interpreter_addr,
+            INTERP_STATE_BUFFER_SIZE,
+            interp_state_buffer) < 0) {
+        set_exception_cause(self, PyExc_RuntimeError,
+                            "Failed to read interpreter state buffer");
+        return -1;
+    }
+    refresh_generation_caches_from_interp_state(self, interpreter_addr, interp_state_buffer);
+    return 0;
+}
+
+static int
+read_interp_state_and_maybe_thread_frame(
+    RemoteUnwinderObject *unwinder,
+    uintptr_t interpreter_addr,
+    char *interp_state_buffer,
+    char *tstate_buffer,
+    char *frame_buffer,
+    RemoteReadPrefetch *prefetch)
+{
+    prefetch->tstate = NULL;
+    prefetch->frame = NULL;
+    if (prefetch->tstate_addr != 0) {
+        size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size;
+        _Py_RemoteReadSegment segments[3] = {
+            {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE},
+            {prefetch->tstate_addr, tstate_buffer, tstate_size},
+            {prefetch->frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
+        };
+        int nsegs = prefetch->frame_addr != 0 ? 3 : 2;
+        Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
+            &unwinder->handle, segments, nsegs);
+        int completed = 0;
+        if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) {
+            completed = 1;
+            Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE
+                + (Py_ssize_t)tstate_size;
+            if (nread >= with_tstate) {
+                completed = 2;
+            }
+            if (nsegs == 3
+                    && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) {
+                completed = 3;
+            }
+        }
+        STATS_BATCHED_READ(unwinder, nsegs, completed);
+        if (completed >= 1) {
+            if (completed >= 2) {
+                prefetch->tstate = tstate_buffer;
+            }
+            if (completed >= 3) {
+                prefetch->frame = frame_buffer;
+            }
+            return 0;
+        }
+    }
+    return _Py_RemoteDebug_ReadRemoteMemory(
+        &unwinder->handle,
+        interpreter_addr,
+        INTERP_STATE_BUFFER_SIZE,
+        interp_state_buffer);
+}
+
 /*[clinic input]
 @permit_long_docstring_body
 @critical_section
@@ -537,15 +708,32 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
     while (current_interpreter != 0) {
         // Read interpreter state to get the interpreter ID
         char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
-        if (_Py_RemoteDebug_PagedReadRemoteMemory(
-                &self->handle,
+        char prefetched_tstate[SIZEOF_THREAD_STATE];
+        char prefetched_frame[SIZEOF_INTERP_FRAME];
+        RemoteReadPrefetch prefetch = {0};
+        if (self->cache_frames) {
+            prefetch.tstate_addr = get_cached_tstate_for_interpreter(
+                self, current_interpreter);
+        }
+        if (prefetch.tstate_addr != 0) {
+            FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr);
+            if (entry && entry->num_addrs > 0) {
+                prefetch.frame_addr = entry->addrs[0];
+            }
+        }
+
+        if (read_interp_state_and_maybe_thread_frame(
+                self,
                 current_interpreter,
-                INTERP_STATE_BUFFER_SIZE,
-                interp_state_buffer) < 0) {
+                interp_state_buffer,
+                prefetched_tstate,
+                prefetched_frame,
+                &prefetch) < 0) {
             set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer");
             Py_CLEAR(result);
             goto exit;
         }
+        refresh_generation_caches_from_interp_state(self, current_interpreter, interp_state_buffer);
 
         uintptr_t gc_frame = 0;
         if (self->gc) {
@@ -557,25 +745,6 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
         int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer,
                 self->debug_offsets.interpreter_state.id);
 
-        // Get code object generation from buffer
-        uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
-                self->debug_offsets.interpreter_state.code_object_generation);
-
-        if (code_object_generation != self->code_object_generation) {
-            self->code_object_generation = code_object_generation;
-            _Py_hashtable_clear(self->code_object_cache);
-        }
-
-#ifdef Py_GIL_DISABLED
-        // Check TLBC generation and invalidate cache if needed
-        uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
-                                                      self->debug_offsets.interpreter_state.tlbc_generation);
-        if (current_tlbc_generation != self->tlbc_generation) {
-            self->tlbc_generation = current_tlbc_generation;
-            _Py_hashtable_clear(self->tlbc_cache);
-        }
-#endif
-
         // Create a list to hold threads for this interpreter
         PyObject *interpreter_threads = PyList_New(0);
         if (!interpreter_threads) {
@@ -611,6 +780,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
             // Target specific thread (only process first interpreter)
             current_tstate = self->tstate_addr;
         }
+        if (current_tstate != 0 && self->cache_frames) {
+            set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate);
+        }
 
         // Acquire main thread state information
         uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
@@ -621,7 +793,8 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
             PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate,
                                                            gil_holder_tstate,
                                                            gc_frame,
-                                                           main_thread_tstate);
+                                                           main_thread_tstate,
+                                                           &prefetch);
             if (!frame_info) {
                 // Check if this was an intentional skip due to mode-based filtering
                 if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL ||
@@ -771,6 +944,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s
     if (ensure_async_debug_offsets(self) < 0) {
         return NULL;
     }
+    if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
+        return NULL;
+    }
 
     PyObject *result = PyList_New(0);
     if (result == NULL) {
@@ -860,6 +1036,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
     if (ensure_async_debug_offsets(self) < 0) {
         return NULL;
     }
+    if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
+        return NULL;
+    }
 
     PyObject *result = PyList_New(0);
     if (result == NULL) {
@@ -904,8 +1083,15 @@ RemoteUnwinder was created with stats=True.
         - code_object_cache_hits: Code object cache hits
         - code_object_cache_misses: Code object cache misses
         - stale_cache_invalidations: Times stale cache entries were cleared
+        - batched_read_attempts: Batched remote-read attempts
+        - batched_read_successes: Attempts that read all requested segments
+        - batched_read_misses: Attempts that fell back or partially read
+        - batched_read_segments_requested: Segments requested by batched reads
+        - batched_read_segments_completed: Segments completed by batched reads
         - frame_cache_hit_rate: Percentage of samples that hit the cache
         - code_object_cache_hit_rate: Percentage of code object lookups that hit cache
+        - batched_read_success_rate: Percentage of batched reads that completed all segments
+        - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads
 
 Raises:
     RuntimeError: If stats collection was not enabled (stats=False)
@@ -913,7 +1099,7 @@ RemoteUnwinder was created with stats=True.
 
 static PyObject *
 _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
-/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
+/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/
 {
     if (!self->collect_stats) {
         PyErr_SetString(PyExc_RuntimeError,
@@ -948,9 +1134,24 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
     ADD_STAT(code_object_cache_hits);
     ADD_STAT(code_object_cache_misses);
     ADD_STAT(stale_cache_invalidations);
+    ADD_STAT(batched_read_attempts);
+    ADD_STAT(batched_read_successes);
+    ADD_STAT(batched_read_misses);
+    ADD_STAT(batched_read_segments_requested);
+    ADD_STAT(batched_read_segments_completed);
 
 #undef ADD_STAT
 
+#define ADD_DERIVED_STAT(name, value) do { \
+    PyObject *val = PyFloat_FromDouble(value); \
+    if (!val || PyDict_SetItemString(result, name, val) < 0) { \
+        Py_XDECREF(val); \
+        Py_DECREF(result); \
+        return NULL; \
+    } \
+    Py_DECREF(val); \
+} while(0)
+
     // Calculate and add derived statistics
     // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups
     double frame_cache_hit_rate = 0.0;
@@ -959,26 +1160,33 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
         frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
                                / (double)total_cache_lookups;
     }
-    PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate);
-    if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) {
-        Py_XDECREF(hit_rate);
-        Py_DECREF(result);
-        return NULL;
-    }
-    Py_DECREF(hit_rate);
+    ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate);
 
     double code_object_hit_rate = 0.0;
     uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
     if (total_code_lookups > 0) {
         code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups;
     }
-    PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate);
-    if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) {
-        Py_XDECREF(code_hit_rate);
-        Py_DECREF(result);
-        return NULL;
+    ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate);
+
+    double batched_read_success_rate = 0.0;
+    if (self->stats.batched_read_attempts > 0) {
+        batched_read_success_rate =
+            100.0 * (double)self->stats.batched_read_successes
+            / (double)self->stats.batched_read_attempts;
     }
-    Py_DECREF(code_hit_rate);
+    ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate);
+
+    double batched_read_segment_completion_rate = 0.0;
+    if (self->stats.batched_read_segments_requested > 0) {
+        batched_read_segment_completion_rate =
+            100.0 * (double)self->stats.batched_read_segments_completed
+            / (double)self->stats.batched_read_segments_requested;
+    }
+    ADD_DERIVED_STAT("batched_read_segment_completion_rate",
+                     batched_read_segment_completion_rate);
+
+#undef ADD_DERIVED_STAT
 
     return result;
 }
diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c
index d775234b8d78d7..ae120a26d5f4ec 100644
--- a/Modules/_remote_debugging/threads.c
+++ b/Modules/_remote_debugging/threads.c
@@ -289,28 +289,110 @@ typedef struct {
     unsigned int :24;
 } _thread_status;
 
+static int
+read_thread_state_and_maybe_frame(
+    RemoteUnwinderObject *unwinder,
+    uintptr_t tstate_addr,
+    size_t tstate_size,
+    char *tstate_buffer,
+    uintptr_t predicted_frame_addr,
+    char *frame_buffer,
+    int *frame_read)
+{
+    *frame_read = 0;
+    if (predicted_frame_addr != 0) {
+        _Py_RemoteReadSegment segments[2] = {
+            {tstate_addr, tstate_buffer, tstate_size},
+            {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
+        };
+        Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
+            &unwinder->handle, segments, 2);
+        int completed = 0;
+        if (nread >= (Py_ssize_t)tstate_size) {
+            completed = 1;
+            if (nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) {
+                completed = 2;
+            }
+        }
+        STATS_BATCHED_READ(unwinder, 2, completed);
+        if (completed >= 1) {
+            *frame_read = completed == 2;
+            return 0;
+        }
+    }
+    return _Py_RemoteDebug_ReadRemoteMemory(
+        &unwinder->handle, tstate_addr, tstate_size, tstate_buffer);
+}
+
 PyObject*
 unwind_stack_for_thread(
     RemoteUnwinderObject *unwinder,
     uintptr_t *current_tstate,
     uintptr_t gil_holder_tstate,
     uintptr_t gc_frame,
-    uintptr_t main_thread_tstate
+    uintptr_t main_thread_tstate,
+    const RemoteReadPrefetch *prefetch
 ) {
     PyObject *frame_info = NULL;
     PyObject *thread_id = NULL;
     PyObject *result = NULL;
     StackChunkList chunks = {0};
 
-    char ts[SIZEOF_THREAD_STATE];
-    int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
-        &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts);
-    if (bytes_read < 0) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
-        goto error;
+    char local_ts[SIZEOF_THREAD_STATE];
+    char local_prefetched_frame[SIZEOF_INTERP_FRAME];
+    const char *ts;
+    RemoteReadPrefetch ctx_prefetch = {0};
+    if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) {
+        ts = prefetch->tstate;
+        if (prefetch->frame) {
+            ctx_prefetch.frame = prefetch->frame;
+            ctx_prefetch.frame_addr = prefetch->frame_addr;
+        }
+    }
+    else if (unwinder->cache_frames) {
+        uintptr_t predicted_frame_addr = 0;
+        int have_prefetched_frame = 0;
+        FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate);
+        if (entry && entry->num_addrs > 0) {
+            predicted_frame_addr = entry->addrs[0];
+        }
+
+        int rc = read_thread_state_and_maybe_frame(
+            unwinder,
+            *current_tstate,
+            (size_t)unwinder->debug_offsets.thread_state.size,
+            local_ts,
+            predicted_frame_addr,
+            local_prefetched_frame,
+            &have_prefetched_frame);
+        if (rc < 0) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
+            goto error;
+        }
+        ts = local_ts;
+        if (have_prefetched_frame) {
+            ctx_prefetch.frame = local_prefetched_frame;
+            ctx_prefetch.frame_addr = predicted_frame_addr;
+        }
+    }
+    else {
+        int rc = _Py_RemoteDebug_ReadRemoteMemory(
+            &unwinder->handle,
+            *current_tstate,
+            (size_t)unwinder->debug_offsets.thread_state.size,
+            local_ts);
+        if (rc < 0) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
+            goto error;
+        }
+        ts = local_ts;
     }
     STATS_INC(unwinder, memory_reads);
     STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
+    if (ctx_prefetch.frame) {
+        STATS_INC(unwinder, memory_reads);
+        STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
+    }
 
     long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
 
@@ -432,9 +514,11 @@ unwind_stack_for_thread(
     uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
     FrameWalkContext ctx = {
         .frame_addr = frame_addr,
+        .thread_state_addr = *current_tstate,
         .base_frame_addr = base_frame_addr,
         .gc_frame = gc_frame,
         .chunks = &chunks,
+        .prefetch = ctx_prefetch,
         .frame_info = frame_info,
         .frame_addrs = addrs,
         .num_addrs = 0,
@@ -467,10 +551,18 @@ unwind_stack_for_thread(
 
     *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
 
-    thread_id = PyLong_FromLongLong(tid);
+    if (unwinder->cache_frames) {
+        FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid);
+        if (entry && entry->thread_id_obj) {
+            thread_id = Py_NewRef(entry->thread_id_obj);
+        }
+    }
     if (thread_id == NULL) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
-        goto error;
+        thread_id = PyLong_FromLongLong(tid);
+        if (thread_id == NULL) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
+            goto error;
+        }
     }
 
     RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder);
diff --git a/Python/remote_debug.h b/Python/remote_debug.h
index 6c089a834dcd40..7b2c4f3bcb8077 100644
--- a/Python/remote_debug.h
+++ b/Python/remote_debug.h
@@ -147,6 +147,7 @@ typedef struct {
     int memfd;
 #endif
     page_cache_entry_t pages[MAX_PAGES];
+    int page_cache_count;
     Py_ssize_t page_size;
 } proc_handle_t;
 
@@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle)
         handle->pages[i].data = NULL;
         handle->pages[i].valid = 0;
     }
+    handle->page_cache_count = 0;
 }
 
 UNUSED static void
 _Py_RemoteDebug_ClearCache(proc_handle_t *handle)
 {
-    for (int i = 0; i < MAX_PAGES; i++) {
+    for (int i = 0; i < handle->page_cache_count; i++) {
         handle->pages[i].valid = 0;
     }
+    handle->page_cache_count = 0;
 }
 
 #if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
@@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) {
     handle->memfd = -1;
 #endif
     handle->page_size = get_page_size();
+    handle->page_cache_count = 0;
     for (int i = 0; i < MAX_PAGES; i++) {
         handle->pages[i].data = NULL;
         handle->pages[i].valid = 0;
@@ -1287,8 +1291,9 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
         return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
     }
 
-    // Search for valid cached page
-    for (int i = 0; i < MAX_PAGES; i++) {
+    // Search only the pages used since the last clear. The cache is cleared
+    // between profiler samples, so entries are packed at the front.
+    for (int i = 0; i < handle->page_cache_count; i++) {
         page_cache_entry_t *entry = &handle->pages[i];
         if (entry->valid && entry->page_addr == page_base) {
             memcpy(out, entry->data + offset_in_page, size);
@@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
         }
     }
 
-    // Find reusable slot
-    for (int i = 0; i < MAX_PAGES; i++) {
-        page_cache_entry_t *entry = &handle->pages[i];
-        if (!entry->valid) {
+    if (handle->page_cache_count < MAX_PAGES) {
+        page_cache_entry_t *entry = &handle->pages[handle->page_cache_count];
+        if (entry->data == NULL) {
+            entry->data = PyMem_RawMalloc(page_size);
             if (entry->data == NULL) {
-                entry->data = PyMem_RawMalloc(page_size);
-                if (entry->data == NULL) {
-                    PyErr_NoMemory();
-                    _set_debug_exception_cause(PyExc_MemoryError,
-                        "Cannot allocate %zu bytes for page cache entry "
-                        "during read from PID %d at address 0x%lx",
-                        page_size, handle->pid, addr);
-                    return -1;
-                }
-            }
-
-            if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
-                // Try to just copy the exact amount as a fallback
-                PyErr_Clear();
-                goto fallback;
+                PyErr_NoMemory();
+                _set_debug_exception_cause(PyExc_MemoryError,
+                    "Cannot allocate %zu bytes for page cache entry "
+                    "during read from PID %d at address 0x%lx",
+                    page_size, handle->pid, addr);
+                return -1;
             }
+        }
 
-            entry->page_addr = page_base;
-            entry->valid = 1;
-            memcpy(out, entry->data + offset_in_page, size);
-            return 0;
+        if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
+            // Try to just copy the exact amount as a fallback
+            PyErr_Clear();
+            goto fallback;
         }
+
+        entry->page_addr = page_base;
+        entry->valid = 1;
+        handle->page_cache_count++;
+        memcpy(out, entry->data + offset_in_page, size);
+        return 0;
     }
 
 fallback:
@@ -1330,6 +1333,49 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
     return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
 }
 
+typedef struct {
+    uintptr_t remote_addr;
+    void *local_buf;
+    size_t size;
+} _Py_RemoteReadSegment;
+
+#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4
+
+// Batched read of multiple remote regions in a single syscall when supported.
+// Returns total bytes read (>= 0) on success, -1 if batched reads are
+// unavailable or the syscall failed. Callers compare the return value against
+// cumulative segment sizes to determine which segments were fully populated.
+UNUSED static Py_ssize_t
+_Py_RemoteDebug_BatchedReadRemoteMemory(
+    proc_handle_t *handle,
+    const _Py_RemoteReadSegment *segments,
+    int nsegs)
+{
+#if defined(__linux__) && HAVE_PROCESS_VM_READV
+    if (handle->memfd == -1
+        && nsegs > 0
+        && nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) {
+        struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
+        struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
+        for (int i = 0; i < nsegs; i++) {
+            local[i].iov_base = segments[i].local_buf;
+            local[i].iov_len = segments[i].size;
+            remote[i].iov_base = (void *)segments[i].remote_addr;
+            remote[i].iov_len = segments[i].size;
+        }
+        ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0);
+        if (nread >= 0) {
+            return (Py_ssize_t)nread;
+        }
+    }
+#else
+    (void)handle;
+    (void)segments;
+    (void)nsegs;
+#endif
+    return -1;
+}
+
 UNUSED static int
 _Py_RemoteDebug_ReadDebugOffsets(
     proc_handle_t *handle,
diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py
index fee3435496da0b..8e367422a961da 100644
--- a/Tools/inspection/benchmark_external_inspection.py
+++ b/Tools/inspection/benchmark_external_inspection.py
@@ -151,6 +151,45 @@ def create_threads(n):
     time.sleep(0.05)
 '''
 
+ASYNC_CODE = '''\
+import asyncio
+import contextlib
+import math
+
+def compute_slice(seed):
+    result = 0.0
+    for i in range(2000):
+        result += math.sin(seed + i) * math.sqrt(i + 1)
+    return result
+
+async def leaf_task(seed):
+    total = 0.0
+    while True:
+        total += compute_slice(seed)
+        await asyncio.sleep(0)
+
+async def parent_task(seed):
+    child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}")
+    try:
+        while True:
+            compute_slice(seed)
+            await asyncio.sleep(0.001)
+    finally:
+        child.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await child
+
+async def main():
+    tasks = [
+        asyncio.create_task(parent_task(i), name=f"parent-{i}")
+        for i in range(8)
+    ]
+    await asyncio.gather(*tasks)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+'''
+
 CODE_EXAMPLES = {
     "basic": {
         "code": CODE,
@@ -164,10 +203,29 @@ def create_threads(n):
         "code": CODE_WITH_TONS_OF_THREADS,
         "description": "Tons of threads doing mixed CPU/IO work",
     },
+    "asyncio": {
+        "code": ASYNC_CODE,
+        "description": "Asyncio tasks with active and awaited coroutine chains",
+    },
+}
+
+OPERATIONS = {
+    "stack_trace": {
+        "method": "get_stack_trace",
+        "label": "get_stack_trace()",
+    },
+    "async_stack_trace": {
+        "method": "get_async_stack_trace",
+        "label": "get_async_stack_trace()",
+    },
+    "all_awaited_by": {
+        "method": "get_all_awaited_by",
+        "label": "get_all_awaited_by()",
+    },
 }
 
 
-def benchmark(unwinder, duration_seconds=10, blocking=False):
+def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"):
     """Benchmark mode - measure raw sampling speed for specified duration"""
     sample_count = 0
     fail_count = 0
@@ -175,11 +233,14 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
     start_time = time.perf_counter()
     end_time = start_time + duration_seconds
     total_attempts = 0
+    operation_info = OPERATIONS[operation]
+    operation_method = getattr(unwinder, operation_info["method"])
 
     colors = get_colors(can_colorize())
 
     print(
-        f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}"
+        f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed "
+        f"for {duration_seconds} seconds...{colors.RESET}"
     )
 
     try:
@@ -190,8 +251,8 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
                 if blocking:
                     unwinder.pause_threads()
                 try:
-                    stack_trace = unwinder.get_stack_trace()
-                    if stack_trace:
+                    sample = operation_method()
+                    if sample:
                         sample_count += 1
                 finally:
                     if blocking:
@@ -239,6 +300,7 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
             (sample_count / total_attempts) * 100 if total_attempts > 0 else 0
         ),
         "total_work_time": total_work_time,
+        "operation": operation_info["label"],
         "avg_work_time_us": (
             (total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0
         ),
@@ -252,7 +314,7 @@ def print_benchmark_results(results):
     colors = get_colors(can_colorize())
 
     print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
-    print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}")
+    print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}")
     print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
 
     # Basic statistics
@@ -329,6 +391,8 @@ def parse_arguments():
   %(prog)s -d 60                     # Run basic benchmark for 60 seconds
   %(prog)s --code deep_static        # Run deep static call stack benchmark
   %(prog)s --code deep_static -d 30  # Run deep static benchmark for 30 seconds
+  %(prog)s --operation async_stack_trace
+  %(prog)s --operation all_awaited_by
 
 Available code examples:
 {examples_desc}
@@ -348,8 +412,15 @@ def parse_arguments():
         "--code",
         "-c",
         choices=list(CODE_EXAMPLES.keys()),
-        default="basic",
-        help="Code example to benchmark (default: basic)",
+        default=None,
+        help="Code example to benchmark (default: basic, or asyncio for async operations)",
+    )
+
+    parser.add_argument(
+        "--operation",
+        choices=list(OPERATIONS.keys()),
+        default="stack_trace",
+        help="Remote unwinder operation to benchmark (default: stack_trace)",
     )
 
     parser.add_argument(
@@ -365,7 +436,10 @@ def parse_arguments():
         help="Stop all threads before sampling for consistent snapshots",
     )
 
-    return parser.parse_args()
+    args = parser.parse_args()
+    if args.code is None:
+        args.code = "asyncio" if args.operation != "stack_trace" else "basic"
+    return args
 
 
 def create_target_process(temp_file, code_example="basic"):
@@ -420,6 +494,9 @@ def main():
     print(
         f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds"
     )
+    print(
+        f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}"
+    )
     print(
         f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}"
     )
@@ -451,7 +528,12 @@ def main():
                     unwinder = _remote_debugging.RemoteUnwinder(
                         process.pid, cache_frames=True, **kwargs
                     )
-                    results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking)
+                    results = benchmark(
+                        unwinder,
+                        duration_seconds=args.duration,
+                        blocking=args.blocking,
+                        operation=args.operation,
+                    )
                 finally:
                     cleanup_process(process, temp_file_path)
 

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

Reply via email to