https://github.com/python/cpython/commit/cba2974a545c9d35f15f25575c214fbda601a47a
commit: cba2974a545c9d35f15f25575c214fbda601a47a
branch: 3.13
author: Pablo Galindo Salgado <pablog...@gmail.com>
committer: pablogsal <pablog...@gmail.com>
date: 2025-07-11T14:02:19Z
summary:

[3.13] gh-136541: Fix several problems of perf trampolines in x86_64 and aarch64 (GH-136500) (#136545)

This commit fixes the following problems:

* The x86_64 trampolines were not preserving frame pointers
* The hardcoded offsets to the code segment from the FDE only worked properly for x86_64
* The CIE data was not following aarch64 conventions
* The eh_frame for aarch64 was not fully correct

(cherry picked from commit 236f733d8ffb3d587e1167fa0a0248c24512e7fd)

files:
A Misc/NEWS.d/next/Core and Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst
M Python/asm_trampoline.S
M Python/perf_jit_trampoline.c
M Python/perf_trampoline.c

diff --git a/Misc/NEWS.d/next/Core and Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst b/Misc/NEWS.d/next/Core and Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst
new file mode 100644
index 00000000000000..af9b94ad0613d7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2025-07-11-13-45-48.gh-issue-136541.uZ_-Ju.rst
@@ -0,0 +1,3 @@
+Fix some issues with the perf trampolines on x86-64 and aarch64. The
+trampolines were not being generated correctly for some cases, which could
+lead to the perf integration not working correctly. Patch by Pablo Galindo.
diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S
index 616752459ba4d9..a14e68c0e81932 100644
--- a/Python/asm_trampoline.S
+++ b/Python/asm_trampoline.S
@@ -12,9 +12,10 @@ _Py_trampoline_func_start:
 #if defined(__CET__) && (__CET__ & 1)
     endbr64
 #endif
-    sub     $8, %rsp
-    call    *%rcx
-    add     $8, %rsp
+    push    %rbp
+    mov     %rsp, %rbp
+    call    *%rcx
+    pop     %rbp
     ret
 #endif // __x86_64__
 #if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
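The assembly change above replaces the bare 8-byte stack adjustment with a conventional frame-pointer prologue and epilogue. In C terms the trampoline now behaves like a small wrapper compiled with frame pointers enabled; the sketch below is only illustrative (the typedef and names are placeholders, not CPython's actual declarations):

/* Illustrative sketch, not CPython source: placeholder types and names. */
typedef void *(*evaluator_fn)(void *tstate, void *frame, int throwflag);

static void *trampoline_like(void *tstate, void *frame, int throwflag,
                             evaluator_fn evaluator)
{
    /* push %rbp; mov %rsp,%rbp -- the prologue keeps the frame-pointer chain
     * intact, so perf and GDB can unwind from the evaluator back through the
     * trampoline into the calling frame. */
    void *result = evaluator(tstate, frame, throwflag);
    /* pop %rbp; ret -- the epilogue restores the caller's frame pointer. */
    return result;
}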
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c
index 6547a6cca79ec5..a44a74f58df85a 100644
--- a/Python/perf_jit_trampoline.c
+++ b/Python/perf_jit_trampoline.c
@@ -96,10 +96,9 @@
  *   /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
  *   /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
  *
- * The padding size (0x100) is chosen to accommodate typical unwind info sizes
- * while maintaining 16-byte alignment requirements.
+ * The padding size is now calculated automatically during initialization
+ * based on the actual unwind information requirements.
  */
-#define PERF_JIT_CODE_PADDING 0x100
 
 /* Convenient access to the global trampoline API state */
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
@@ -400,10 +399,12 @@ enum {
     DWRF_CFA_nop = 0x0,                    // No operation
     DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
     DWRF_CFA_def_cfa = 0xc,                // Define CFA rule
+    DWRF_CFA_def_cfa_register = 0xd,       // Define CFA register
     DWRF_CFA_def_cfa_offset = 0xe,         // Define CFA offset
     DWRF_CFA_offset_extended_sf = 0x11,    // Extended signed offset
     DWRF_CFA_advance_loc = 0x40,           // Advance location counter
-    DWRF_CFA_offset = 0x80                 // Simple offset instruction
+    DWRF_CFA_offset = 0x80,                // Simple offset instruction
+    DWRF_CFA_restore = 0xc0                // Restore register
 };
 
 /* DWARF Exception Handling pointer encodings */
@@ -518,6 +519,7 @@ typedef struct ELFObjectContext {
     uint8_t* p;            // Current write position in buffer
     uint8_t* startp;       // Start of buffer (for offset calculations)
     uint8_t* eh_frame_p;   // Start of EH frame data (for relative offsets)
+    uint8_t* fde_p;        // Start of FDE data (for PC-relative calculations)
     uint32_t code_size;    // Size of the code being described
 } ELFObjectContext;
 
@@ -642,6 +644,8 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
 // DWARF EH FRAME GENERATION
 // =============================================================================
 
+static void elf_init_ehframe(ELFObjectContext* ctx);
+
 /*
  * Initialize DWARF .eh_frame section for a code region
  *
@@ -656,6 +660,23 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
  * Args:
  *     ctx: ELF object context containing code size and buffer pointers
  */
+static size_t calculate_eh_frame_size(void) {
+    /* Calculate the EH frame size for the trampoline function */
+    extern void *_Py_trampoline_func_start;
+    extern void *_Py_trampoline_func_end;
+
+    size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start;
+
+    ELFObjectContext ctx;
+    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
+    ctx.code_size = code_size;
+    ctx.startp = ctx.p = (uint8_t*)buffer;
+    ctx.fde_p = NULL;
+
+    elf_init_ehframe(&ctx);
+    return ctx.p - ctx.startp;
+}
+
 static void elf_init_ehframe(ELFObjectContext* ctx) {
     uint8_t* p = ctx->p;
     uint8_t* framep = p;   // Remember start of frame data
@@ -783,7 +804,7 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
  *
  *     DWRF_SECTION(FDE,
  *         DWRF_U32((uint32_t)(p - framep));    // Offset to CIE (relative from here)
- *         DWRF_U32(-0x30);                     // Initial PC-relative location of the code
+ *         DWRF_U32(pc_relative_offset);        // PC-relative location of the code (calculated dynamically)
 *         DWRF_U32(ctx->code_size);            // Code range covered by this FDE
 *         DWRF_U8(0);                          // Augmentation data length (none)
 *
@@ -829,19 +850,31 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
     DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
     DWRF_U8(DWRF_CIE_VERSION);             // CIE version (1)
     DWRF_STR("zR");                        // Augmentation string ("zR" = has LSDA)
-    DWRF_UV(1);                            // Code alignment factor
+#ifdef __x86_64__
+    DWRF_UV(1);                            // Code alignment factor (x86_64: 1 byte)
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    DWRF_UV(4);                            // Code alignment factor (AArch64: 4 bytes per instruction)
+#endif
     DWRF_SV(-(int64_t)sizeof(uintptr_t));  // Data alignment factor (negative)
     DWRF_U8(DWRF_REG_RA);                  // Return address register number
     DWRF_UV(1);                            // Augmentation data length
     DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4);  // FDE pointer encoding
 
     /* Initial CFI instructions - describe default calling convention */
+#ifdef __x86_64__
+    /* x86_64 initial CFI state */
     DWRF_U8(DWRF_CFA_def_cfa);             // Define CFA (Call Frame Address)
     DWRF_UV(DWRF_REG_SP);                  // CFA = SP register
     DWRF_UV(sizeof(uintptr_t));            // CFA = SP + pointer_size
     DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA);  // Return address is saved
     DWRF_UV(1);                            // At offset 1 from CFA
-
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    /* AArch64 initial CFI state */
+    DWRF_U8(DWRF_CFA_def_cfa);             // Define CFA (Call Frame Address)
+    DWRF_UV(DWRF_REG_SP);                  // CFA = SP register
+    DWRF_UV(0);                            // CFA = SP + 0 (AArch64 starts with offset 0)
+    // No initial register saves in AArch64 CIE
+#endif
     DWRF_ALIGNNOP(sizeof(uintptr_t));      // Align to pointer boundary
 )
 
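The CIE above now sets the code alignment factor per architecture (1 on x86_64, 4 on AArch64). Every DWRF_CFA_advance_loc delta in the FDE that follows is multiplied by this factor, which is why "advance_loc | 1" means one byte on x86_64 but one 4-byte instruction on AArch64. A standalone illustration of that scaling (an editorial sketch, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* DW_CFA_advance_loc packs its delta into the low 6 bits of the opcode byte;
 * the unwinder multiplies that delta by the CIE's code alignment factor. */
static unsigned advance_loc_bytes(uint8_t opcode, unsigned code_alignment_factor)
{
    unsigned delta = opcode & 0x3f;        /* low 6 bits of the opcode byte */
    return delta * code_alignment_factor;  /* bytes advanced in .text */
}

int main(void)
{
    /* x86_64 CIE (factor 1): "advance_loc | 1" steps past the 1-byte push %rbp */
    printf("x86_64 : %u byte(s)\n", advance_loc_bytes(0x40 | 1, 1));
    /* AArch64 CIE (factor 4): "advance_loc | 1" steps past one 4-byte instruction */
    printf("aarch64: %u byte(s)\n", advance_loc_bytes(0x40 | 1, 4));
    return 0;
}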
@@ -852,11 +885,15 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
      *
      * The FDE describes unwinding information specific to this function.
      * It references the CIE and provides function-specific CFI instructions.
+     *
+     * The PC-relative offset is calculated after the entire EH frame is built
+     * to ensure accurate positioning relative to the synthesized DSO layout.
      */
     DWRF_SECTION(FDE,
         DWRF_U32((uint32_t)(p - framep));  // Offset to CIE (backwards reference)
-        DWRF_U32(-0x30);                   // Machine code offset relative to .text
-        DWRF_U32(ctx->code_size);          // Address range covered by this FDE (code lenght)
+        ctx->fde_p = p;                    // Remember where PC offset field is located for later calculation
+        DWRF_U32(0);                       // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe)
+        DWRF_U32(ctx->code_size);          // Address range covered by this FDE (code length)
         DWRF_U8(0);                        // Augmentation data length (none)
 
         /*
@@ -867,32 +904,36 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
          * conventions and register usage patterns.
          */
 #ifdef __x86_64__
-        /* x86_64 calling convention unwinding rules */
+        /* x86_64 calling convention unwinding rules with frame pointer */
 # if defined(__CET__) && (__CET__ & 1)
-        DWRF_U8(DWRF_CFA_advance_loc | 8);  // Advance location by 8 bytes when CET protection is enabled
-# else
-        DWRF_U8(DWRF_CFA_advance_loc | 4);  // Advance location by 4 bytes
+        DWRF_U8(DWRF_CFA_advance_loc | 4);  // Advance past endbr64 (4 bytes)
 # endif
-        DWRF_U8(DWRF_CFA_def_cfa_offset);   // Redefine CFA offset
+        DWRF_U8(DWRF_CFA_advance_loc | 1);  // Advance past push %rbp (1 byte)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);   // def_cfa_offset 16
         DWRF_UV(16);                        // New offset: SP + 16
-        DWRF_U8(DWRF_CFA_advance_loc | 6);  // Advance location by 6 bytes
-        DWRF_U8(DWRF_CFA_def_cfa_offset);   // Redefine CFA offset
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP);  // offset r6 at cfa-16
+        DWRF_UV(2);                         // Offset factor: 2 * 8 = 16 bytes
+        DWRF_U8(DWRF_CFA_advance_loc | 3);  // Advance past mov %rsp,%rbp (3 bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6
+        DWRF_UV(DWRF_REG_BP);               // Use base pointer register
+        DWRF_U8(DWRF_CFA_advance_loc | 3);  // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
+        DWRF_U8(DWRF_CFA_def_cfa);          // def_cfa r7 ofs 8
+        DWRF_UV(DWRF_REG_SP);               // Use stack pointer register
         DWRF_UV(8);                         // New offset: SP + 8
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
         /* AArch64 calling convention unwinding rules */
-        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance location by 1 instruction (stp x29, x30)
-        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
-        DWRF_UV(16);                              // CFA = SP + 16 (stack pointer after push)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Frame pointer (x29) saved
-        DWRF_UV(2);                               // At offset 2 from CFA (2 * 8 = 16 bytes)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Link register (x30) saved
-        DWRF_UV(1);                               // At offset 1 from CFA (1 * 8 = 8 bytes)
-        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Restore frame pointer (x29)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Restore link register (x30)
-        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Final CFA adjustment
-        DWRF_UV(0);                               // CFA = SP + 0 (stack restored)
-
+        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction (4 bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
+        DWRF_UV(16);                              // Stack pointer moved by 16 bytes
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
+        DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 bytes from CFA)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
+        DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes from CFA)
+        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (12 bytes)
+        DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO DWRF_UV() after this!
+        DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO DWRF_UV() after this!
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 0 (stack restored)
+        DWRF_UV(0);                               // Back to original stack position
 #else
 # error "Unsupported target architecture"
 #endif
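The AArch64 rules above depend on the fact that DWRF_CFA_offset and DWRF_CFA_restore both pack the register number into the low 6 bits of the opcode byte, but only DWRF_CFA_offset is followed by a ULEB128 operand; DWRF_CFA_restore takes none, hence the "NO DWRF_UV() after this!" comments. A minimal encoder sketch of that distinction (editorial example; the EX_* names are placeholders mirroring the DWRF_CFA_* values):

#include <stdint.h>

/* "Packed" DWARF CFA opcodes: the primary opcode lives in the top two bits. */
enum {
    EX_CFA_advance_loc = 0x40,  /* delta in low 6 bits, no trailing operand       */
    EX_CFA_offset      = 0x80,  /* register in low 6 bits + ULEB128 offset factor */
    EX_CFA_restore     = 0xc0   /* register in low 6 bits, no trailing operand    */
};

/* "Register reg is saved at CFA - factor * data_alignment" (operand required). */
static uint8_t *emit_offset(uint8_t *p, uint8_t reg, uint8_t factor)
{
    *p++ = EX_CFA_offset | reg;  /* opcode and register share one byte    */
    *p++ = factor;               /* ULEB128 operand (a single byte here)  */
    return p;
}

/* "Register reg reverts to its CIE rule" - no operand byte may follow. */
static uint8_t *emit_restore(uint8_t *p, uint8_t reg)
{
    *p++ = EX_CFA_restore | reg;
    return p;
}

int main(void)
{
    uint8_t buf[4], *p = buf;
    p = emit_offset(p, 30, 1);   /* e.g. x30 saved at CFA-8 on AArch64        */
    p = emit_restore(p, 30);     /* later: x30 restored, no operand emitted   */
    return (int)(p - buf);       /* 3 bytes total: 0x9e, 0x01, 0xde           */
}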
@@ -901,6 +942,58 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
     )
 
     ctx->p = p;   // Update context pointer to end of generated data
+
+    /* Calculate and update the PC-relative offset in the FDE
+     *
+     * When perf processes the jitdump, it creates a synthesized DSO with this layout:
+     *
+     *   Synthesized DSO Memory Layout:
+     *
+     *   ┌─────────────────────────────────────────────────────────────┐ < code_start
+     *   │ Code Section                                                │
+     *   │ (round_up(code_size, 8) bytes)                              │
+     *   ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
+     *   │ EH Frame Data                                               │
+     *   │ ┌─────────────────────────────────────────────────────┐     │
+     *   │ │ CIE data                                            │     │
+     *   │ └─────────────────────────────────────────────────────┘     │
+     *   │ ┌─────────────────────────────────────────────────────┐     │
+     *   │ │ FDE Header:                                         │     │
+     *   │ │   - CIE offset (4 bytes)                            │     │
+     *   │ │   - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼─────┼─> points to code_start
+     *   │ │   - address range (4 bytes)                         │     │   (this specific field)
+     *   │ │ CFI Instructions...                                 │     │
+     *   │ └─────────────────────────────────────────────────────┘     │
+     *   ├─────────────────────────────────────────────────────────────┤ < reference_point
+     *   │ EhFrameHeader                                               │
+     *   │ (navigation metadata)                                       │
+     *   └─────────────────────────────────────────────────────────────┘
+     *
+     * The PC offset field in the FDE must contain the distance from itself to code_start:
+     *
+     *     distance = code_start - fde_pc_field
+     *
+     * Where:
+     *     fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
+     *     code_start_location   = reference_point - eh_frame_size - round_up(code_size, 8)
+     *
+     * Therefore:
+     *     distance = code_start_location - fde_pc_field_location
+     *              = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
+     *              = -rounded_code_size - fde_offset_in_frame
+     *              = -(round_up(code_size, 8) + fde_offset_in_frame)
+     *
+     * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field.
+     */
+    if (ctx->fde_p != NULL) {
+        int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp);
+        int32_t rounded_code_size = round_up(ctx->code_size, 8);
+        int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame);
+
+        // Update the PC-relative offset in the FDE
+        *(int32_t*)ctx->fde_p = pc_relative_offset;
+    }
 }
 
 // =============================================================================
@@ -1001,8 +1094,10 @@ static void* perf_map_jit_init(void) {
     /* Initialize code ID counter */
     perf_jit_map_state.code_id = 0;
 
-    /* Configure trampoline API with padding information */
-    trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
+    /* Calculate padding size based on actual unwind info requirements */
+    size_t eh_frame_size = calculate_eh_frame_size();
+    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
+    trampoline_api.code_padding = round_up(unwind_data_size, 16);
 
     return &perf_jit_map_state;
 }
@@ -1091,6 +1186,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
     char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
     ctx.code_size = code_size;
     ctx.startp = ctx.p = (uint8_t*)buffer;
+    ctx.fde_p = NULL;   // Initialize to NULL, will be set when FDE is written
 
     /* Generate EH frame (Exception Handling frame) data */
     elf_init_ehframe(&ctx);
@@ -1109,7 +1205,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
     ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
 
     /* Verify we don't exceed our padding budget */
-    assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
+    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
 
     ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
     ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment
@@ -1261,4 +1357,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
     &perf_map_jit_fini,      // Cleanup function
 };
 
-#endif /* PY_HAVE_PERF_TRAMPOLINE */
\ No newline at end of file
+#endif /* PY_HAVE_PERF_TRAMPOLINE */
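To make the arithmetic in the comment above concrete, here is the same computation as a standalone program with made-up sizes (the numbers are purely illustrative; the real values depend on the trampoline size and the generated CFI):

#include <stdint.h>
#include <stdio.h>

/* Power-of-two round-up, as used conceptually by the trampoline code. */
static size_t round_up(size_t value, size_t alignment)
{
    return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
    size_t  code_size           = 30;  /* hypothetical trampoline size in bytes     */
    int32_t fde_offset_in_frame = 28;  /* hypothetical offset of the FDE's PC field */

    /* Distance from the FDE PC field back to code_start in the synthesized DSO. */
    int32_t pc_relative_offset =
        -(int32_t)(round_up(code_size, 8) + (size_t)fde_offset_in_frame);

    printf("pc_relative_offset = %d\n", (int)pc_relative_offset);  /* prints -60 */
    return 0;
}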
diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c
index f144f7d436fe68..b1b787fc27892e 100644
--- a/Python/perf_trampoline.c
+++ b/Python/perf_trampoline.c
@@ -186,6 +186,8 @@ struct code_arena_st {
         *prev;  // Pointer to the arena or NULL if this is the first arena.
 };
 
+#define CODE_ALIGNMENT 32
+
 typedef struct code_arena_st code_arena_t;
 typedef struct trampoline_api_st trampoline_api_t;
 
@@ -291,7 +293,9 @@ new_code_arena(void)
     void *start = &_Py_trampoline_func_start;
     void *end = &_Py_trampoline_func_end;
     size_t code_size = end - start;
-    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
+    size_t unaligned_size = code_size + trampoline_api.code_padding;
+    size_t chunk_size = round_up(unaligned_size, CODE_ALIGNMENT);
+    assert(chunk_size % CODE_ALIGNMENT == 0);
     // TODO: Check the effect of alignment of the code chunks. Initial investigation
     // showed that this has no effect on performance in x86-64 or aarch64 and the current
     // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
@@ -356,7 +360,9 @@ static inline py_trampoline
 code_arena_new_code(code_arena_t *code_arena)
 {
     py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
-    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
+    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding,
+                                      CODE_ALIGNMENT);
+    assert(total_code_size % CODE_ALIGNMENT == 0);
     code_arena->size_left -= total_code_size;
     code_arena->current_addr += total_code_size;
     return trampoline;
@@ -489,9 +495,6 @@ _PyPerfTrampoline_Init(int activate)
     }
     else {
         tstate->interp->eval_frame = py_trampoline_evaluator;
-        if (new_code_arena() < 0) {
-            return -1;
-        }
         extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         if (extra_code_index == -1) {
             return -1;
@@ -499,6 +502,9 @@ _PyPerfTrampoline_Init(int activate)
         if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
             trampoline_api.state = trampoline_api.init_state();
         }
+        if (new_code_arena() < 0) {
+            return -1;
+        }
         perf_status = PERF_STATUS_OK;
     }
 #endif

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: arch...@mail-archive.com