https://github.com/python/cpython/commit/3d8c38f6db0fea7845aafb92fe6bc795b536a367
commit: 3d8c38f6db0fea7845aafb92fe6bc795b536a367
branch: main
author: Brandt Bucher <brandtbuc...@microsoft.com>
committer: brandtbucher <brandtbuc...@gmail.com>
date: 2025-07-14T10:14:20-07:00
summary:

GH-135904: Improve the JIT's performance on macOS (GH-136528)

files:
M Python/jit.c
M Tools/jit/_optimizers.py
M Tools/jit/_targets.py
M Tools/jit/jit.h
M Tools/jit/shim.c
M Tools/jit/template.c

diff --git a/Python/jit.c b/Python/jit.c
index e232cc1f7d9250..01bc0076497c6d 100644
--- a/Python/jit.c
+++ b/Python/jit.c
@@ -431,8 +431,10 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s
 
 #if defined(__aarch64__) || defined(_M_ARM64)
     #define TRAMPOLINE_SIZE 16
+    #define DATA_ALIGN 8
 #else
     #define TRAMPOLINE_SIZE 0
+    #define DATA_ALIGN 1
 #endif
 
 // Generate and patch AArch64 trampolines. The symbols to jump to are stored
@@ -522,8 +524,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
     // Round up to the nearest page:
     size_t page_size = get_page_size();
     assert((page_size & (page_size - 1)) == 0);
-    size_t padding = page_size - ((code_size + state.trampolines.size + data_size) & (page_size - 1));
-    size_t total_size = code_size + state.trampolines.size + data_size  + padding;
+    size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1));
+    size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1));
+    size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding;
     unsigned char *memory = jit_alloc(total_size);
     if (memory == NULL) {
         return -1;
@@ -545,7 +548,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
     // Loop again to emit the code:
     unsigned char *code = memory;
     state.trampolines.mem = memory + code_size;
-    unsigned char *data = memory + code_size + state.trampolines.size;
+    unsigned char *data = memory + code_size + state.trampolines.size + code_padding;
     // Compile the shim, which handles converting between the native
     // calling convention and the calling convention used by jitted code
     // (which may be different for efficiency reasons).
@@ -567,7 +570,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
     code += group->code_size;
     data += group->data_size;
     assert(code == memory + code_size);
-    assert(data == memory + code_size + state.trampolines.size + data_size);
+    assert(data == memory + code_size + state.trampolines.size + code_padding + data_size);
 #ifdef MAP_JIT
     pthread_jit_write_protect_np(1);
 #endif
diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py
index 1077e4106fdfbd..33db110b728dba 100644
--- a/Tools/jit/_optimizers.py
+++ b/Tools/jit/_optimizers.py
@@ -70,21 +70,21 @@ class Optimizer:
 
     path: pathlib.Path
     _: dataclasses.KW_ONLY
-    # prefix used to mangle symbols on some platforms:
-    prefix: str = ""
+    # Prefixes used to mangle local labels and symbols:
+    label_prefix: str
+    symbol_prefix: str
     # The first block in the linked list:
     _root: _Block = dataclasses.field(init=False, default_factory=_Block)
     _labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict)
     # No groups:
     _re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile(
-        r"\s*(?:\.|#|//|$)"
+        r"\s*(?:\.|#|//|;|$)"
     )
     # One group (label):
     _re_label: typing.ClassVar[re.Pattern[str]] = re.compile(
         r'\s*(?P<label>[\w."$?@]+):'
     )
     # Override everything that follows in subclasses:
-    _alignment: typing.ClassVar[int] = 1
     _branches: typing.ClassVar[dict[str, str | None]] = {}
     # Two groups (instruction and target):
     _re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
@@ -131,8 +131,12 @@ def __post_init__(self) -> None:
                 block.fallthrough = False
 
     def _preprocess(self, text: str) -> str:
-        # Override this method to do preprocessing of the textual assembly:
-        return text
+        # Override this method to do preprocessing of the textual assembly.
+        # In all cases, replace references to the _JIT_CONTINUE symbol with
+        # references to a local _JIT_CONTINUE label (which we will add later):
+        continue_symbol = rf"\b{re.escape(self.symbol_prefix)}_JIT_CONTINUE\b"
+        continue_label = f"{self.label_prefix}_JIT_CONTINUE"
+        return re.sub(continue_symbol, continue_label, text)
 
     @classmethod
     def _invert_branch(cls, line: str, target: str) -> str | None:
@@ -197,15 +201,12 @@ def _insert_continue_label(self) -> None:
         #    jmp FOO
         # After:
         #    jmp FOO
-        #    .balign 8
         #    _JIT_CONTINUE:
         # This lets the assembler encode _JIT_CONTINUE jumps at build time!
-        align = _Block()
-        align.noninstructions.append(f"\t.balign\t{self._alignment}")
-        continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE")
+        continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE")
         assert continuation.label
         continuation.noninstructions.append(f"{continuation.label}:")
-        end.link, align.link, continuation.link = align, continuation, end.link
+        end.link, continuation.link = continuation, end.link
 
     def _mark_hot_blocks(self) -> None:
         # Start with the last block, and perform a DFS to find all blocks that
@@ -285,8 +286,6 @@ def run(self) -> None:
 class OptimizerAArch64(Optimizer):  # pylint: disable = too-few-public-methods
     """aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu"""
 
-    # TODO: @diegorusso
-    _alignment = 8
     # https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch-
     _re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)")
 
@@ -302,18 +301,3 @@ class OptimizerX86(Optimizer):  # pylint: disable = too-few-public-methods
     _re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)")
     # https://www.felixcloutier.com/x86/ret
     _re_return = re.compile(r"\s*ret\b")
-
-
-class OptimizerX8664Windows(OptimizerX86):  # pylint: disable = too-few-public-methods
-    """x86_64-pc-windows-msvc"""
-
-    def _preprocess(self, text: str) -> str:
-        text = super()._preprocess(text)
-        # Before:
-        #     rex64 jmpq *__imp__JIT_CONTINUE(%rip)
-        # After:
-        #     jmp _JIT_CONTINUE
-        far_indirect_jump = (
-            rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)"
-        )
-        return re.sub(far_indirect_jump, r"jmp\t\g<target>", text)
diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py
index 728f48128ce79c..3883671e92aa39 100644
--- a/Tools/jit/_targets.py
+++ b/Tools/jit/_targets.py
@@ -44,7 +44,8 @@ class _Target(typing.Generic[_S, _R]):
     _: dataclasses.KW_ONLY
     args: typing.Sequence[str] = ()
     optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer
-    prefix: str = ""
+    label_prefix: typing.ClassVar[str]
+    symbol_prefix: typing.ClassVar[str]
     stable: bool = False
     debug: bool = False
     verbose: bool = False
@@ -172,7 +173,9 @@ async def _compile(
             *shlex.split(self.cflags),
         ]
         await _llvm.run("clang", args_s, echo=self.verbose)
-        self.optimizer(s, prefix=self.prefix).run()
+        self.optimizer(
+            s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix
+        ).run()
         args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"]
         await _llvm.run("clang", args_o, echo=self.verbose)
         return await self._parse(o)
@@ -274,7 +277,7 @@ def _handle_section(
             symbol = wrapped_symbol["Symbol"]
             offset = base + symbol["Value"]
             name = symbol["Name"]
-            name = name.removeprefix(self.prefix)
+            name = name.removeprefix(self.symbol_prefix)
             if name not in group.symbols:
                 group.symbols[name] = value, offset
         for wrapped_relocation in section["Relocations"]:
@@ -285,9 +288,9 @@ def _handle_section(
     def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]:
         if name.startswith("__imp_"):
             name = name.removeprefix("__imp_")
-            name = name.removeprefix(self.prefix)
+            name = name.removeprefix(self.symbol_prefix)
             return _stencils.HoleValue.GOT, name
-        name = name.removeprefix(self.prefix)
+        name = name.removeprefix(self.symbol_prefix)
         return _stencils.symbol_to_value(name)
 
     def _handle_relocation(
@@ -335,9 +338,24 @@ def _handle_relocation(
         return _stencils.Hole(offset, kind, value, symbol, addend)
 
 
+class _COFF32(_COFF):
+    # These mangle like Mach-O and other "older" formats:
+    label_prefix = "L"
+    symbol_prefix = "_"
+
+
+class _COFF64(_COFF):
+    # These mangle like ELF and other "newer" formats:
+    label_prefix = ".L"
+    symbol_prefix = ""
+
+
 class _ELF(
     _Target[_schema.ELFSection, _schema.ELFRelocation]
 ):  # pylint: disable = too-few-public-methods
+    label_prefix = ".L"
+    symbol_prefix = ""
+
     def _handle_section(
         self, section: _schema.ELFSection, group: _stencils.StencilGroup
     ) -> None:
@@ -374,7 +392,7 @@ def _handle_section(
                 symbol = wrapped_symbol["Symbol"]
                 offset = len(stencil.body) + symbol["Value"]
                 name = symbol["Name"]["Name"]
-                name = name.removeprefix(self.prefix)
+                name = name.removeprefix(self.symbol_prefix)
                 group.symbols[name] = value, offset
             stencil.body.extend(section["SectionData"]["Bytes"])
             assert not section["Relocations"]
@@ -409,7 +427,7 @@ def _handle_relocation(
                 },
             }:
                 offset += base
-                s = s.removeprefix(self.prefix)
+                s = s.removeprefix(self.symbol_prefix)
                 value, symbol = _stencils.HoleValue.GOT, s
             case {
                 "Addend": addend,
@@ -418,7 +436,7 @@ def _handle_relocation(
                 "Type": {"Name": kind},
             }:
                 offset += base
-                s = s.removeprefix(self.prefix)
+                s = s.removeprefix(self.symbol_prefix)
                 value, symbol = _stencils.symbol_to_value(s)
             case _:
                 raise NotImplementedError(relocation)
@@ -428,6 +446,9 @@ def _handle_relocation(
 class _MachO(
     _Target[_schema.MachOSection, _schema.MachORelocation]
 ):  # pylint: disable = too-few-public-methods
+    label_prefix = "L"
+    symbol_prefix = "_"
+
     def _handle_section(
         self, section: _schema.MachOSection, group: _stencils.StencilGroup
     ) -> None:
@@ -435,10 +456,10 @@ def _handle_section(
         assert "SectionData" in section
         flags = {flag["Name"] for flag in section["Attributes"]["Flags"]}
         name = section["Name"]["Value"]
-        name = name.removeprefix(self.prefix)
+        name = name.removeprefix(self.symbol_prefix)
         if "Debug" in flags:
             return
-        if "SomeInstructions" in flags:
+        if "PureInstructions" in flags:
             value = _stencils.HoleValue.CODE
             stencil = group.code
             start_address = 0
@@ -459,7 +480,7 @@ def _handle_section(
             symbol = wrapped_symbol["Symbol"]
             offset = symbol["Value"] - start_address
             name = symbol["Name"]["Name"]
-            name = name.removeprefix(self.prefix)
+            name = name.removeprefix(self.symbol_prefix)
             group.symbols[name] = value, offset
         assert "Relocations" in section
         for wrapped_relocation in section["Relocations"]:
@@ -484,7 +505,7 @@ def _handle_relocation(
                 },
             }:
                 offset += base
-                s = s.removeprefix(self.prefix)
+                s = s.removeprefix(self.symbol_prefix)
                 value, symbol = _stencils.HoleValue.GOT, s
                 addend = 0
             case {
@@ -493,7 +514,7 @@ def _handle_relocation(
                 "Type": {"Name": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
             }:
                 offset += base
-                s = s.removeprefix(self.prefix)
+                s = s.removeprefix(self.symbol_prefix)
                 value, symbol = _stencils.HoleValue.GOT, s
                 addend = (
                     int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
@@ -508,7 +529,7 @@ def _handle_relocation(
                 "Type": {"Name": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind},
             }:
                 offset += base
-                s = s.removeprefix(self.prefix)
+                s = s.removeprefix(self.symbol_prefix)
                 value, symbol = _stencils.symbol_to_value(s)
                 addend = (
                     int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
@@ -523,7 +544,7 @@ def _handle_relocation(
                 "Type": {"Name": kind},
             }:
                 offset += base
-                s = s.removeprefix(self.prefix)
+                s = s.removeprefix(self.symbol_prefix)
                 value, symbol = _stencils.symbol_to_value(s)
                 addend = 0
             case _:
@@ -531,19 +552,19 @@ def _handle_relocation(
         return _stencils.Hole(offset, kind, value, symbol, addend)
 
 
-def get_target(host: str) -> _COFF | _ELF | _MachO:
+def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO:
     """Build a _Target for the given host "triple" and options."""
     optimizer: type[_optimizers.Optimizer]
-    target: _COFF | _ELF | _MachO
+    target: _COFF32 | _COFF64 | _ELF | _MachO
     if re.fullmatch(r"aarch64-apple-darwin.*", host):
         condition = "defined(__aarch64__) && defined(__APPLE__)"
         optimizer = _optimizers.OptimizerAArch64
-        target = _MachO(host, condition, optimizer=optimizer, prefix="_")
+        target = _MachO(host, condition, optimizer=optimizer)
     elif re.fullmatch(r"aarch64-pc-windows-msvc", host):
         args = ["-fms-runtime-lib=dll", "-fplt"]
         condition = "defined(_M_ARM64)"
         optimizer = _optimizers.OptimizerAArch64
-        target = _COFF(host, condition, args=args, optimizer=optimizer)
+        target = _COFF64(host, condition, args=args, optimizer=optimizer)
     elif re.fullmatch(r"aarch64-.*-linux-gnu", host):
         # -mno-outline-atomics: Keep intrinsics from being emitted.
         args = ["-fpic", "-mno-outline-atomics"]
@@ -555,16 +576,16 @@ def get_target(host: str) -> _COFF | _ELF | _MachO:
         args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"]
         optimizer = _optimizers.OptimizerX86
         condition = "defined(_M_IX86)"
-        target = _COFF(host, condition, args=args, optimizer=optimizer, prefix="_")
+        target = _COFF32(host, condition, args=args, optimizer=optimizer)
     elif re.fullmatch(r"x86_64-apple-darwin.*", host):
         condition = "defined(__x86_64__) && defined(__APPLE__)"
         optimizer = _optimizers.OptimizerX86
-        target = _MachO(host, condition, optimizer=optimizer, prefix="_")
+        target = _MachO(host, condition, optimizer=optimizer)
     elif re.fullmatch(r"x86_64-pc-windows-msvc", host):
         args = ["-fms-runtime-lib=dll"]
         condition = "defined(_M_X64)"
-        optimizer = _optimizers.OptimizerX8664Windows
-        target = _COFF(host, condition, args=args, optimizer=optimizer)
+        optimizer = _optimizers.OptimizerX86
+        target = _COFF64(host, condition, args=args, optimizer=optimizer)
     elif re.fullmatch(r"x86_64-.*-linux-gnu", host):
         args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"]
         condition = "defined(__x86_64__) && defined(__linux__)"
diff --git a/Tools/jit/jit.h b/Tools/jit/jit.h
index f767ef68127eb7..10829654eabb38 100644
--- a/Tools/jit/jit.h
+++ b/Tools/jit/jit.h
@@ -6,3 +6,7 @@ typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none;
 #define PATCH_VALUE(TYPE, NAME, ALIAS) \
     PyAPI_DATA(void) ALIAS;            \
     TYPE NAME = (TYPE)(uintptr_t)&ALIAS;
+
+#define DECLARE_TARGET(NAME)                     \
+    _Py_CODEUNIT *__attribute__((preserve_none, visibility("hidden"))) \
+    NAME(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate);
diff --git a/Tools/jit/shim.c b/Tools/jit/shim.c
index ebd4e9bc858b73..0c7feb746c9679 100644
--- a/Tools/jit/shim.c
+++ b/Tools/jit/shim.c
@@ -10,6 +10,6 @@ _Py_CODEUNIT *
 _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate)
 {
     // Note that this is *not* a tail call:
-    PATCH_VALUE(jit_func_preserve_none, call, _JIT_CONTINUE);
-    return call(frame, stack_pointer, tstate);
+    DECLARE_TARGET(_JIT_CONTINUE);
+    return _JIT_CONTINUE(frame, stack_pointer, tstate);
 }
diff --git a/Tools/jit/template.c b/Tools/jit/template.c
index 5ee26f93f1e266..d07f56e9ce6b42 100644
--- a/Tools/jit/template.c
+++ b/Tools/jit/template.c
@@ -74,10 +74,10 @@ do {                                                \
     do {                       \
     } while (0)
 
-#define PATCH_JUMP(ALIAS)                                                \
-do {                                                                     \
-    PATCH_VALUE(jit_func_preserve_none, jump, ALIAS);                    \
-    __attribute__((musttail)) return jump(frame, stack_pointer, tstate); \
+#define PATCH_JUMP(ALIAS)                                                 \
+do {                                                                      \
+    DECLARE_TARGET(ALIAS);                                                \
+    __attribute__((musttail)) return ALIAS(frame, stack_pointer, tstate); \
 } while (0)
 
 #undef JUMP_TO_JUMP_TARGET

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: arch...@mail-archive.com

Reply via email to