https://github.com/python/cpython/commit/cba2974a545c9d35f15f25575c214fbda601a47a
commit: cba2974a545c9d35f15f25575c214fbda601a47a
branch: 3.13
author: Pablo Galindo Salgado <pablog...@gmail.com>
committer: pablogsal <pablog...@gmail.com>
date: 2025-07-11T14:02:19Z
summary:

[3.13] gh-136541: Fix several problems of perf trampolines in x86_64 and 
aarch64 (GH-136500) (#136545)

This commit fixes the following problems:

* The x86_64 trampolines are not preserving frame pointers
* The hardcoded offsets to the code segment from the FDE only worked properly 
for x86_64
* The CIE data was not following conventions of aarch64
* The eh_frame for aarch64 was not fully correct
(cherry picked from commit 236f733d8ffb3d587e1167fa0a0248c24512e7fd)

files:
A Misc/NEWS.d/next/Core and 
Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst
M Python/asm_trampoline.S
M Python/perf_jit_trampoline.c
M Python/perf_trampoline.c

diff --git a/Misc/NEWS.d/next/Core and 
Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst b/Misc/NEWS.d/next/Core 
and Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst
new file mode 100644
index 00000000000000..af9b94ad0613d7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and 
Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst 
@@ -0,0 +1,3 @@
+Fix some issues with the perf trampolines on x86-64 and aarch64.  The
+trampolines were not being generated correctly for some cases, which could
+lead to the perf integration not working correctly. Patch by Pablo Galindo.
diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S
index 616752459ba4d9..a14e68c0e81932 100644
--- a/Python/asm_trampoline.S
+++ b/Python/asm_trampoline.S
@@ -12,9 +12,10 @@ _Py_trampoline_func_start:
 #if defined(__CET__) && (__CET__ & 1)
     endbr64
 #endif
-    sub    $8, %rsp
-    call    *%rcx
-    add    $8, %rsp
+    push   %rbp
+    mov    %rsp, %rbp
+    call   *%rcx
+    pop    %rbp
     ret
 #endif // __x86_64__
 #if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c
index 6547a6cca79ec5..a44a74f58df85a 100644
--- a/Python/perf_jit_trampoline.c
+++ b/Python/perf_jit_trampoline.c
@@ -96,10 +96,9 @@
  * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
  * /tmp/jitted-PID-1.so:                                       
[headers][.text][unwind_info][padding]
  *
- * The padding size (0x100) is chosen to accommodate typical unwind info sizes
- * while maintaining 16-byte alignment requirements.
+ * The padding size is now calculated automatically during initialization
+ * based on the actual unwind information requirements.
  */
-#define PERF_JIT_CODE_PADDING 0x100
 
 /* Convenient access to the global trampoline API state */
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
@@ -400,10 +399,12 @@ enum {
     DWRF_CFA_nop = 0x0,                    // No operation
     DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
     DWRF_CFA_def_cfa = 0xc,               // Define CFA rule
+    DWRF_CFA_def_cfa_register = 0xd,      // Define CFA register
     DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset
     DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
     DWRF_CFA_advance_loc = 0x40,          // Advance location counter
-    DWRF_CFA_offset = 0x80                // Simple offset instruction
+    DWRF_CFA_offset = 0x80,               // Simple offset instruction
+    DWRF_CFA_restore = 0xc0               // Restore register
 };
 
 /* DWARF Exception Handling pointer encodings */
@@ -518,6 +519,7 @@ typedef struct ELFObjectContext {
     uint8_t* p;            // Current write position in buffer
     uint8_t* startp;       // Start of buffer (for offset calculations)
     uint8_t* eh_frame_p;   // Start of EH frame data (for relative offsets)
+    uint8_t* fde_p;        // Start of FDE data (for PC-relative calculations)
     uint32_t code_size;    // Size of the code being described
 } ELFObjectContext;
 
@@ -642,6 +644,8 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, 
uint32_t v) {
 //                              DWARF EH FRAME GENERATION
 // 
=============================================================================
 
+static void elf_init_ehframe(ELFObjectContext* ctx);
+
 /*
  * Initialize DWARF .eh_frame section for a code region
  *
@@ -656,6 +660,23 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, 
uint32_t v) {
  * Args:
  *   ctx: ELF object context containing code size and buffer pointers
  */
+static size_t calculate_eh_frame_size(void) {
+    /* Calculate the EH frame size for the trampoline function */
+    extern void *_Py_trampoline_func_start;
+    extern void *_Py_trampoline_func_end;
+
+    size_t code_size = (char*)&_Py_trampoline_func_end - 
(char*)&_Py_trampoline_func_start;
+
+    ELFObjectContext ctx;
+    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
+    ctx.code_size = code_size;
+    ctx.startp = ctx.p = (uint8_t*)buffer;
+    ctx.fde_p = NULL;
+
+    elf_init_ehframe(&ctx);
+    return ctx.p - ctx.startp;
+}
+
 static void elf_init_ehframe(ELFObjectContext* ctx) {
     uint8_t* p = ctx->p;
     uint8_t* framep = p;  // Remember start of frame data
@@ -783,7 +804,7 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
     *
     *     DWRF_SECTION(FDE,
     *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE 
(relative from here)
-    *         DWRF_U32(-0x30);                        // Initial PC-relative 
location of the code
+    *         DWRF_U32(pc_relative_offset);           // PC-relative location 
of the code (calculated dynamically)
     *         DWRF_U32(ctx->code_size);               // Code range covered by 
this FDE
     *         DWRF_U8(0);                             // Augmentation data 
length (none)
     *
@@ -829,19 +850,31 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
         DWRF_U32(0);                           // CIE ID (0 indicates this is 
a CIE)
         DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
         DWRF_STR("zR");                       // Augmentation string ("zR" = 
has LSDA)
-        DWRF_UV(1);                           // Code alignment factor
+#ifdef __x86_64__
+        DWRF_UV(1);                           // Code alignment factor 
(x86_64: 1 byte)
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+        DWRF_UV(4);                           // Code alignment factor 
(AArch64: 4 bytes per instruction)
+#endif
         DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor 
(negative)
         DWRF_U8(DWRF_REG_RA);                 // Return address register number
         DWRF_UV(1);                           // Augmentation data length
         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding
 
         /* Initial CFI instructions - describe default calling convention */
+#ifdef __x86_64__
+        /* x86_64 initial CFI state */
         DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame 
Address)
         DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
         DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
         DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
         DWRF_UV(1);                           // At offset 1 from CFA
-
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+        /* AArch64 initial CFI state */
+        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame 
Address)
+        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
+        DWRF_UV(0);                           // CFA = SP + 0 (AArch64 starts 
with offset 0)
+        // No initial register saves in AArch64 CIE
+#endif
         DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
     )
 
@@ -852,11 +885,15 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
      *
      * The FDE describes unwinding information specific to this function.
      * It references the CIE and provides function-specific CFI instructions.
+     *
+     * The PC-relative offset is calculated after the entire EH frame is built
+     * to ensure accurate positioning relative to the synthesized DSO layout.
      */
     DWRF_SECTION(FDE,
         DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards 
reference)
-        DWRF_U32(-0x30);                      // Machine code offset relative 
to .text
-        DWRF_U32(ctx->code_size);             // Address range covered by this 
FDE (code lenght)
+        ctx->fde_p = p;                        // Remember where PC offset 
field is located for later calculation
+        DWRF_U32(0);                           // Placeholder for PC-relative 
offset (calculated at end of elf_init_ehframe)
+        DWRF_U32(ctx->code_size);             // Address range covered by this 
FDE (code length)
         DWRF_U8(0);                           // Augmentation data length 
(none)
 
         /*
@@ -867,32 +904,36 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
          * conventions and register usage patterns.
          */
 #ifdef __x86_64__
-        /* x86_64 calling convention unwinding rules */
+        /* x86_64 calling convention unwinding rules with frame pointer */
 #  if defined(__CET__) && (__CET__ & 1)
-        DWRF_U8(DWRF_CFA_advance_loc | 8);    // Advance location by 8 bytes 
when CET protection is enabled
-#  else
-        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance location by 4 bytes
+        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance past endbr64 (4 bytes)
 #  endif
-        DWRF_U8(DWRF_CFA_def_cfa_offset);     // Redefine CFA offset
+        DWRF_U8(DWRF_CFA_advance_loc | 1);    // Advance past push %rbp (1 
byte)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);     // def_cfa_offset 16
         DWRF_UV(16);                          // New offset: SP + 16
-        DWRF_U8(DWRF_CFA_advance_loc | 6);    // Advance location by 6 bytes
-        DWRF_U8(DWRF_CFA_def_cfa_offset);     // Redefine CFA offset
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16
+        DWRF_UV(2);                           // Offset factor: 2 * 8 = 16 
bytes
+        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past mov %rsp,%rbp (3 
bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_register);   // def_cfa_register r6
+        DWRF_UV(DWRF_REG_BP);                 // Use base pointer register
+        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past call *%rcx (2 
bytes) + pop %rbp (1 byte) = 3
+        DWRF_U8(DWRF_CFA_def_cfa);            // def_cfa r7 ofs 8
+        DWRF_UV(DWRF_REG_SP);                 // Use stack pointer register
         DWRF_UV(8);                           // New offset: SP + 8
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
         /* AArch64 calling convention unwinding rules */
-        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance location by 1 
instruction (stp x29, x30)
-        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
-        DWRF_UV(16);                              // CFA = SP + 16 (stack 
pointer after push)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Frame pointer (x29) saved
-        DWRF_UV(2);                               // At offset 2 from CFA (2 * 
8 = 16 bytes)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Link register (x30) saved
-        DWRF_UV(1);                               // At offset 1 from CFA (1 * 
8 = 8 bytes)
-        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions 
(mov x16, x3; mov x29, sp; ldp...)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Restore frame pointer 
(x29)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Restore link register 
(x30)
-        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Final CFA adjustment
-        DWRF_UV(0);                               // CFA = SP + 0 (stack 
restored)
-
+        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction 
(4 bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
+        DWRF_UV(16);                              // Stack pointer moved by 16 
bytes
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
+        DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 
bytes from CFA)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
+        DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes 
from CFA)
+        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions 
(12 bytes)
+        DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO 
DWRF_UV() after this!
+        DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO 
DWRF_UV() after this!
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 0 (stack 
restored)
+        DWRF_UV(0);                               // Back to original stack 
position
 #else
 #    error "Unsupported target architecture"
 #endif
@@ -901,6 +942,58 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
     )
 
     ctx->p = p;  // Update context pointer to end of generated data
+
+    /* Calculate and update the PC-relative offset in the FDE
+     *
+     * When perf processes the jitdump, it creates a synthesized DSO with this 
layout:
+     *
+     *     Synthesized DSO Memory Layout:
+     *     ┌─────────────────────────────────────────────────────────────┐ < 
code_start
+     *     │                        Code Section                         │
+     *     │                    (round_up(code_size, 8) bytes)           │
+     *     ├─────────────────────────────────────────────────────────────┤ < 
start of EH frame data
+     *     │                      EH Frame Data                          │
+     *     │  ┌─────────────────────────────────────────────────────┐    │
+     *     │  │                 CIE data                            │    │
+     *     │  └─────────────────────────────────────────────────────┘    │
+     *     │  ┌─────────────────────────────────────────────────────┐    │
+     *     │  │ FDE Header:                                         │    │
+     *     │  │   - CIE offset (4 bytes)                            │    │
+     *     │  │   - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> 
points to code_start
+     *     │  │   - address range (4 bytes)                         │    │   
(this specific field)
+     *     │  │ CFI Instructions...                                 │    │
+     *     │  └─────────────────────────────────────────────────────┘    │
+     *     ├─────────────────────────────────────────────────────────────┤ < 
reference_point
+     *     │                    EhFrameHeader                            │
+     *     │                 (navigation metadata)                       │
+     *     └─────────────────────────────────────────────────────────────┘
+     *
+     * The PC offset field in the FDE must contain the distance from itself to 
code_start:
+     *
+     *   distance = code_start - fde_pc_field
+     *
+     * Where:
+     *   fde_pc_field_location = reference_point - eh_frame_size + 
fde_offset_in_frame
+     *   code_start_location = reference_point - eh_frame_size - 
round_up(code_size, 8)
+     *
+     * Therefore:
+     *   distance = code_start_location - fde_pc_field_location
+     *            = (ref - eh_frame_size - rounded_code_size) - (ref - 
eh_frame_size + fde_offset_in_frame)
+     *            = -rounded_code_size - fde_offset_in_frame
+     *            = -(round_up(code_size, 8) + fde_offset_in_frame)
+     *
+     * Note: fde_offset_in_frame is the offset from EH frame start to the PC 
offset field,
+     *
+     */
+    if (ctx->fde_p != NULL) {
+        int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp);
+        int32_t rounded_code_size = round_up(ctx->code_size, 8);
+        int32_t pc_relative_offset = -(rounded_code_size + 
fde_offset_in_frame);
+
+
+        // Update the PC-relative offset in the FDE
+        *(int32_t*)ctx->fde_p = pc_relative_offset;
+    }
 }
 
 // 
=============================================================================
@@ -1001,8 +1094,10 @@ static void* perf_map_jit_init(void) {
     /* Initialize code ID counter */
     perf_jit_map_state.code_id = 0;
 
-    /* Configure trampoline API with padding information */
-    trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
+    /* Calculate padding size based on actual unwind info requirements */
+    size_t eh_frame_size = calculate_eh_frame_size();
+    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
+    trampoline_api.code_padding = round_up(unwind_data_size, 16);
 
     return &perf_jit_map_state;
 }
@@ -1091,6 +1186,7 @@ static void perf_map_jit_write_entry(void *state, const 
void *code_addr,
     char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
     ctx.code_size = code_size;
     ctx.startp = ctx.p = (uint8_t*)buffer;
+    ctx.fde_p = NULL;  // Initialize to NULL, will be set when FDE is written
 
     /* Generate EH frame (Exception Handling frame) data */
     elf_init_ehframe(&ctx);
@@ -1109,7 +1205,7 @@ static void perf_map_jit_write_entry(void *state, const 
void *code_addr,
     ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
 
     /* Verify we don't exceed our padding budget */
-    assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
+    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
 
     ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
     ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment
@@ -1261,4 +1357,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
     &perf_map_jit_fini,        // Cleanup function
 };
 
-#endif /* PY_HAVE_PERF_TRAMPOLINE */
\ No newline at end of file
+#endif /* PY_HAVE_PERF_TRAMPOLINE */
diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c
index f144f7d436fe68..b1b787fc27892e 100644
--- a/Python/perf_trampoline.c
+++ b/Python/perf_trampoline.c
@@ -186,6 +186,8 @@ struct code_arena_st {
         *prev;  // Pointer to the arena  or NULL if this is the first arena.
 };
 
+#define CODE_ALIGNMENT 32
+
 typedef struct code_arena_st code_arena_t;
 typedef struct trampoline_api_st trampoline_api_t;
 
@@ -291,7 +293,9 @@ new_code_arena(void)
     void *start = &_Py_trampoline_func_start;
     void *end = &_Py_trampoline_func_end;
     size_t code_size = end - start;
-    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
+    size_t unaligned_size = code_size + trampoline_api.code_padding;
+    size_t chunk_size = round_up(unaligned_size, CODE_ALIGNMENT);
+    assert(chunk_size % CODE_ALIGNMENT == 0);
     // TODO: Check the effect of alignment of the code chunks. Initial 
investigation
     // showed that this has no effect on performance in x86-64 or aarch64 and 
the current
     // version has the advantage that the unwinder in GDB can unwind across 
JIT-ed code.
@@ -356,7 +360,9 @@ static inline py_trampoline
 code_arena_new_code(code_arena_t *code_arena)
 {
     py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
-    size_t total_code_size = round_up(code_arena->code_size + 
trampoline_api.code_padding, 16);
+    size_t total_code_size = round_up(code_arena->code_size + 
trampoline_api.code_padding,
+                                  CODE_ALIGNMENT);
+    assert(total_code_size % CODE_ALIGNMENT == 0);
     code_arena->size_left -= total_code_size;
     code_arena->current_addr += total_code_size;
     return trampoline;
@@ -489,9 +495,6 @@ _PyPerfTrampoline_Init(int activate)
     }
     else {
         tstate->interp->eval_frame = py_trampoline_evaluator;
-        if (new_code_arena() < 0) {
-            return -1;
-        }
         extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         if (extra_code_index == -1) {
             return -1;
@@ -499,6 +502,9 @@ _PyPerfTrampoline_Init(int activate)
         if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) 
{
             trampoline_api.state = trampoline_api.init_state();
         }
+        if (new_code_arena() < 0) {
+            return -1;
+        }
         perf_status = PERF_STATUS_OK;
     }
 #endif

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: arch...@mail-archive.com

Reply via email to