https://github.com/python/cpython/commit/3d8c38f6db0fea7845aafb92fe6bc795b536a367 commit: 3d8c38f6db0fea7845aafb92fe6bc795b536a367 branch: main author: Brandt Bucher <brandtbuc...@microsoft.com> committer: brandtbucher <brandtbuc...@gmail.com> date: 2025-07-14T10:14:20-07:00 summary:
GH-135904: Improve the JIT's performance on macOS (GH-136528) files: M Python/jit.c M Tools/jit/_optimizers.py M Tools/jit/_targets.py M Tools/jit/jit.h M Tools/jit/shim.c M Tools/jit/template.c diff --git a/Python/jit.c b/Python/jit.c index e232cc1f7d9250..01bc0076497c6d 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -431,8 +431,10 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s #if defined(__aarch64__) || defined(_M_ARM64) #define TRAMPOLINE_SIZE 16 + #define DATA_ALIGN 8 #else #define TRAMPOLINE_SIZE 0 + #define DATA_ALIGN 1 #endif // Generate and patch AArch64 trampolines. The symbols to jump to are stored @@ -522,8 +524,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // Round up to the nearest page: size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); - size_t padding = page_size - ((code_size + state.trampolines.size + data_size) & (page_size - 1)); - size_t total_size = code_size + state.trampolines.size + data_size + padding; + size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); + size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1)); + size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding; unsigned char *memory = jit_alloc(total_size); if (memory == NULL) { return -1; @@ -545,7 +548,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // Loop again to emit the code: unsigned char *code = memory; state.trampolines.mem = memory + code_size; - unsigned char *data = memory + code_size + state.trampolines.size; + unsigned char *data = memory + code_size + state.trampolines.size + code_padding; // Compile the shim, which handles converting between the native // calling convention and the calling convention used by jitted code // (which may be different for efficiency reasons). @@ -567,7 +570,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz code += group->code_size; data += group->data_size; assert(code == memory + code_size); - assert(data == memory + code_size + state.trampolines.size + data_size); + assert(data == memory + code_size + state.trampolines.size + code_padding + data_size); #ifdef MAP_JIT pthread_jit_write_protect_np(1); #endif diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index 1077e4106fdfbd..33db110b728dba 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -70,21 +70,21 @@ class Optimizer: path: pathlib.Path _: dataclasses.KW_ONLY - # prefix used to mangle symbols on some platforms: - prefix: str = "" + # Prefixes used to mangle local labels and symbols: + label_prefix: str + symbol_prefix: str # The first block in the linked list: _root: _Block = dataclasses.field(init=False, default_factory=_Block) _labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict) # No groups: _re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile( - r"\s*(?:\.|#|//|$)" + r"\s*(?:\.|#|//|;|$)" ) # One group (label): _re_label: typing.ClassVar[re.Pattern[str]] = re.compile( r'\s*(?P<label>[\w."$?@]+):' ) # Override everything that follows in subclasses: - _alignment: typing.ClassVar[int] = 1 _branches: typing.ClassVar[dict[str, str | None]] = {} # Two groups (instruction and target): _re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH @@ -131,8 +131,12 @@ def __post_init__(self) -> None: block.fallthrough = False def _preprocess(self, text: str) -> str: - # Override this method to do preprocessing of the textual assembly: - return text + # Override this method to do preprocessing of the textual assembly. + # In all cases, replace references to the _JIT_CONTINUE symbol with + # references to a local _JIT_CONTINUE label (which we will add later): + continue_symbol = rf"\b{re.escape(self.symbol_prefix)}_JIT_CONTINUE\b" + continue_label = f"{self.label_prefix}_JIT_CONTINUE" + return re.sub(continue_symbol, continue_label, text) @classmethod def _invert_branch(cls, line: str, target: str) -> str | None: @@ -197,15 +201,12 @@ def _insert_continue_label(self) -> None: # jmp FOO # After: # jmp FOO - # .balign 8 # _JIT_CONTINUE: # This lets the assembler encode _JIT_CONTINUE jumps at build time! - align = _Block() - align.noninstructions.append(f"\t.balign\t{self._alignment}") - continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE") + continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE") assert continuation.label continuation.noninstructions.append(f"{continuation.label}:") - end.link, align.link, continuation.link = align, continuation, end.link + end.link, continuation.link = continuation, end.link def _mark_hot_blocks(self) -> None: # Start with the last block, and perform a DFS to find all blocks that @@ -285,8 +286,6 @@ def run(self) -> None: class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods """aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu""" - # TODO: @diegorusso - _alignment = 8 # https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch- _re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)") @@ -302,18 +301,3 @@ class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods _re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)") # https://www.felixcloutier.com/x86/ret _re_return = re.compile(r"\s*ret\b") - - -class OptimizerX8664Windows(OptimizerX86): # pylint: disable = too-few-public-methods - """x86_64-pc-windows-msvc""" - - def _preprocess(self, text: str) -> str: - text = super()._preprocess(text) - # Before: - # rex64 jmpq *__imp__JIT_CONTINUE(%rip) - # After: - # jmp _JIT_CONTINUE - far_indirect_jump = ( - rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)" - ) - return re.sub(far_indirect_jump, r"jmp\t\g<target>", text) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 728f48128ce79c..3883671e92aa39 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -44,7 +44,8 @@ class _Target(typing.Generic[_S, _R]): _: dataclasses.KW_ONLY args: typing.Sequence[str] = () optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer - prefix: str = "" + label_prefix: typing.ClassVar[str] + symbol_prefix: typing.ClassVar[str] stable: bool = False debug: bool = False verbose: bool = False @@ -172,7 +173,9 @@ async def _compile( *shlex.split(self.cflags), ] await _llvm.run("clang", args_s, echo=self.verbose) - self.optimizer(s, prefix=self.prefix).run() + self.optimizer( + s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix + ).run() args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] await _llvm.run("clang", args_o, echo=self.verbose) return await self._parse(o) @@ -274,7 +277,7 @@ def _handle_section( symbol = wrapped_symbol["Symbol"] offset = base + symbol["Value"] name = symbol["Name"] - name = name.removeprefix(self.prefix) + name = name.removeprefix(self.symbol_prefix) if name not in group.symbols: group.symbols[name] = value, offset for wrapped_relocation in section["Relocations"]: @@ -285,9 +288,9 @@ def _handle_section( def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]: if name.startswith("__imp_"): name = name.removeprefix("__imp_") - name = name.removeprefix(self.prefix) + name = name.removeprefix(self.symbol_prefix) return _stencils.HoleValue.GOT, name - name = name.removeprefix(self.prefix) + name = name.removeprefix(self.symbol_prefix) return _stencils.symbol_to_value(name) def _handle_relocation( @@ -335,9 +338,24 @@ def _handle_relocation( return _stencils.Hole(offset, kind, value, symbol, addend) +class _COFF32(_COFF): + # These mangle like Mach-O and other "older" formats: + label_prefix = "L" + symbol_prefix = "_" + + +class _COFF64(_COFF): + # These mangle like ELF and other "newer" formats: + label_prefix = ".L" + symbol_prefix = "" + + class _ELF( _Target[_schema.ELFSection, _schema.ELFRelocation] ): # pylint: disable = too-few-public-methods + label_prefix = ".L" + symbol_prefix = "" + def _handle_section( self, section: _schema.ELFSection, group: _stencils.StencilGroup ) -> None: @@ -374,7 +392,7 @@ def _handle_section( symbol = wrapped_symbol["Symbol"] offset = len(stencil.body) + symbol["Value"] name = symbol["Name"]["Name"] - name = name.removeprefix(self.prefix) + name = name.removeprefix(self.symbol_prefix) group.symbols[name] = value, offset stencil.body.extend(section["SectionData"]["Bytes"]) assert not section["Relocations"] @@ -409,7 +427,7 @@ def _handle_relocation( }, }: offset += base - s = s.removeprefix(self.prefix) + s = s.removeprefix(self.symbol_prefix) value, symbol = _stencils.HoleValue.GOT, s case { "Addend": addend, @@ -418,7 +436,7 @@ def _handle_relocation( "Type": {"Name": kind}, }: offset += base - s = s.removeprefix(self.prefix) + s = s.removeprefix(self.symbol_prefix) value, symbol = _stencils.symbol_to_value(s) case _: raise NotImplementedError(relocation) @@ -428,6 +446,9 @@ def _handle_relocation( class _MachO( _Target[_schema.MachOSection, _schema.MachORelocation] ): # pylint: disable = too-few-public-methods + label_prefix = "L" + symbol_prefix = "_" + def _handle_section( self, section: _schema.MachOSection, group: _stencils.StencilGroup ) -> None: @@ -435,10 +456,10 @@ def _handle_section( assert "SectionData" in section flags = {flag["Name"] for flag in section["Attributes"]["Flags"]} name = section["Name"]["Value"] - name = name.removeprefix(self.prefix) + name = name.removeprefix(self.symbol_prefix) if "Debug" in flags: return - if "SomeInstructions" in flags: + if "PureInstructions" in flags: value = _stencils.HoleValue.CODE stencil = group.code start_address = 0 @@ -459,7 +480,7 @@ def _handle_section( symbol = wrapped_symbol["Symbol"] offset = symbol["Value"] - start_address name = symbol["Name"]["Name"] - name = name.removeprefix(self.prefix) + name = name.removeprefix(self.symbol_prefix) group.symbols[name] = value, offset assert "Relocations" in section for wrapped_relocation in section["Relocations"]: @@ -484,7 +505,7 @@ def _handle_relocation( }, }: offset += base - s = s.removeprefix(self.prefix) + s = s.removeprefix(self.symbol_prefix) value, symbol = _stencils.HoleValue.GOT, s addend = 0 case { @@ -493,7 +514,7 @@ def _handle_relocation( "Type": {"Name": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind}, }: offset += base - s = s.removeprefix(self.prefix) + s = s.removeprefix(self.symbol_prefix) value, symbol = _stencils.HoleValue.GOT, s addend = ( int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 @@ -508,7 +529,7 @@ def _handle_relocation( "Type": {"Name": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind}, }: offset += base - s = s.removeprefix(self.prefix) + s = s.removeprefix(self.symbol_prefix) value, symbol = _stencils.symbol_to_value(s) addend = ( int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 @@ -523,7 +544,7 @@ def _handle_relocation( "Type": {"Name": kind}, }: offset += base - s = s.removeprefix(self.prefix) + s = s.removeprefix(self.symbol_prefix) value, symbol = _stencils.symbol_to_value(s) addend = 0 case _: @@ -531,19 +552,19 @@ def _handle_relocation( return _stencils.Hole(offset, kind, value, symbol, addend) -def get_target(host: str) -> _COFF | _ELF | _MachO: +def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO: """Build a _Target for the given host "triple" and options.""" optimizer: type[_optimizers.Optimizer] - target: _COFF | _ELF | _MachO + target: _COFF32 | _COFF64 | _ELF | _MachO if re.fullmatch(r"aarch64-apple-darwin.*", host): condition = "defined(__aarch64__) && defined(__APPLE__)" optimizer = _optimizers.OptimizerAArch64 - target = _MachO(host, condition, optimizer=optimizer, prefix="_") + target = _MachO(host, condition, optimizer=optimizer) elif re.fullmatch(r"aarch64-pc-windows-msvc", host): args = ["-fms-runtime-lib=dll", "-fplt"] condition = "defined(_M_ARM64)" optimizer = _optimizers.OptimizerAArch64 - target = _COFF(host, condition, args=args, optimizer=optimizer) + target = _COFF64(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"aarch64-.*-linux-gnu", host): # -mno-outline-atomics: Keep intrinsics from being emitted. args = ["-fpic", "-mno-outline-atomics"] @@ -555,16 +576,16 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"] optimizer = _optimizers.OptimizerX86 condition = "defined(_M_IX86)" - target = _COFF(host, condition, args=args, optimizer=optimizer, prefix="_") + target = _COFF32(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"x86_64-apple-darwin.*", host): condition = "defined(__x86_64__) && defined(__APPLE__)" optimizer = _optimizers.OptimizerX86 - target = _MachO(host, condition, optimizer=optimizer, prefix="_") + target = _MachO(host, condition, optimizer=optimizer) elif re.fullmatch(r"x86_64-pc-windows-msvc", host): args = ["-fms-runtime-lib=dll"] condition = "defined(_M_X64)" - optimizer = _optimizers.OptimizerX8664Windows - target = _COFF(host, condition, args=args, optimizer=optimizer) + optimizer = _optimizers.OptimizerX86 + target = _COFF64(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"x86_64-.*-linux-gnu", host): args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"] condition = "defined(__x86_64__) && defined(__linux__)" diff --git a/Tools/jit/jit.h b/Tools/jit/jit.h index f767ef68127eb7..10829654eabb38 100644 --- a/Tools/jit/jit.h +++ b/Tools/jit/jit.h @@ -6,3 +6,7 @@ typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none; #define PATCH_VALUE(TYPE, NAME, ALIAS) \ PyAPI_DATA(void) ALIAS; \ TYPE NAME = (TYPE)(uintptr_t)&ALIAS; + +#define DECLARE_TARGET(NAME) \ + _Py_CODEUNIT *__attribute__((preserve_none, visibility("hidden"))) \ + NAME(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate); diff --git a/Tools/jit/shim.c b/Tools/jit/shim.c index ebd4e9bc858b73..0c7feb746c9679 100644 --- a/Tools/jit/shim.c +++ b/Tools/jit/shim.c @@ -10,6 +10,6 @@ _Py_CODEUNIT * _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate) { // Note that this is *not* a tail call: - PATCH_VALUE(jit_func_preserve_none, call, _JIT_CONTINUE); - return call(frame, stack_pointer, tstate); + DECLARE_TARGET(_JIT_CONTINUE); + return _JIT_CONTINUE(frame, stack_pointer, tstate); } diff --git a/Tools/jit/template.c b/Tools/jit/template.c index 5ee26f93f1e266..d07f56e9ce6b42 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -74,10 +74,10 @@ do { \ do { \ } while (0) -#define PATCH_JUMP(ALIAS) \ -do { \ - PATCH_VALUE(jit_func_preserve_none, jump, ALIAS); \ - __attribute__((musttail)) return jump(frame, stack_pointer, tstate); \ +#define PATCH_JUMP(ALIAS) \ +do { \ + DECLARE_TARGET(ALIAS); \ + __attribute__((musttail)) return ALIAS(frame, stack_pointer, tstate); \ } while (0) #undef JUMP_TO_JUMP_TARGET _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3//lists/python-checkins.python.org Member address: arch...@mail-archive.com