https://github.com/python/cpython/commit/3d8c38f6db0fea7845aafb92fe6bc795b536a367
commit: 3d8c38f6db0fea7845aafb92fe6bc795b536a367
branch: main
author: Brandt Bucher <[email protected]>
committer: brandtbucher <[email protected]>
date: 2025-07-14T10:14:20-07:00
summary:
GH-135904: Improve the JIT's performance on macOS (GH-136528)
files:
M Python/jit.c
M Tools/jit/_optimizers.py
M Tools/jit/_targets.py
M Tools/jit/jit.h
M Tools/jit/shim.c
M Tools/jit/template.c
diff --git a/Python/jit.c b/Python/jit.c
index e232cc1f7d9250..01bc0076497c6d 100644
--- a/Python/jit.c
+++ b/Python/jit.c
@@ -431,8 +431,10 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s
#if defined(__aarch64__) || defined(_M_ARM64)
#define TRAMPOLINE_SIZE 16
+ #define DATA_ALIGN 8
#else
#define TRAMPOLINE_SIZE 0
+ #define DATA_ALIGN 1
#endif
// Generate and patch AArch64 trampolines. The symbols to jump to are stored
@@ -522,8 +524,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
// Round up to the nearest page:
size_t page_size = get_page_size();
assert((page_size & (page_size - 1)) == 0);
- size_t padding = page_size - ((code_size + state.trampolines.size +
data_size) & (page_size - 1));
- size_t total_size = code_size + state.trampolines.size + data_size +
padding;
+ size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) &
(DATA_ALIGN - 1));
+ size_t padding = page_size - ((code_size + state.trampolines.size +
code_padding + data_size) & (page_size - 1));
+ size_t total_size = code_size + state.trampolines.size + code_padding +
data_size + padding;
unsigned char *memory = jit_alloc(total_size);
if (memory == NULL) {
return -1;
@@ -545,7 +548,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
// Loop again to emit the code:
unsigned char *code = memory;
state.trampolines.mem = memory + code_size;
- unsigned char *data = memory + code_size + state.trampolines.size;
+ unsigned char *data = memory + code_size + state.trampolines.size +
code_padding;
// Compile the shim, which handles converting between the native
// calling convention and the calling convention used by jitted code
// (which may be different for efficiency reasons).
@@ -567,7 +570,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
code += group->code_size;
data += group->data_size;
assert(code == memory + code_size);
- assert(data == memory + code_size + state.trampolines.size + data_size);
+ assert(data == memory + code_size + state.trampolines.size + code_padding
+ data_size);
#ifdef MAP_JIT
pthread_jit_write_protect_np(1);
#endif
diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py
index 1077e4106fdfbd..33db110b728dba 100644
--- a/Tools/jit/_optimizers.py
+++ b/Tools/jit/_optimizers.py
@@ -70,21 +70,21 @@ class Optimizer:
path: pathlib.Path
_: dataclasses.KW_ONLY
- # prefix used to mangle symbols on some platforms:
- prefix: str = ""
+ # Prefixes used to mangle local labels and symbols:
+ label_prefix: str
+ symbol_prefix: str
# The first block in the linked list:
_root: _Block = dataclasses.field(init=False, default_factory=_Block)
_labels: dict[str, _Block] = dataclasses.field(init=False,
default_factory=dict)
# No groups:
_re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile(
- r"\s*(?:\.|#|//|$)"
+ r"\s*(?:\.|#|//|;|$)"
)
# One group (label):
_re_label: typing.ClassVar[re.Pattern[str]] = re.compile(
r'\s*(?P<label>[\w."$?@]+):'
)
# Override everything that follows in subclasses:
- _alignment: typing.ClassVar[int] = 1
_branches: typing.ClassVar[dict[str, str | None]] = {}
# Two groups (instruction and target):
_re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
@@ -131,8 +131,12 @@ def __post_init__(self) -> None:
block.fallthrough = False
def _preprocess(self, text: str) -> str:
- # Override this method to do preprocessing of the textual assembly:
- return text
+ # Override this method to do preprocessing of the textual assembly.
+ # In all cases, replace references to the _JIT_CONTINUE symbol with
+ # references to a local _JIT_CONTINUE label (which we will add later):
+ continue_symbol = rf"\b{re.escape(self.symbol_prefix)}_JIT_CONTINUE\b"
+ continue_label = f"{self.label_prefix}_JIT_CONTINUE"
+ return re.sub(continue_symbol, continue_label, text)
@classmethod
def _invert_branch(cls, line: str, target: str) -> str | None:
@@ -197,15 +201,12 @@ def _insert_continue_label(self) -> None:
# jmp FOO
# After:
# jmp FOO
- # .balign 8
# _JIT_CONTINUE:
# This lets the assembler encode _JIT_CONTINUE jumps at build time!
- align = _Block()
- align.noninstructions.append(f"\t.balign\t{self._alignment}")
- continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE")
+ continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE")
assert continuation.label
continuation.noninstructions.append(f"{continuation.label}:")
- end.link, align.link, continuation.link = align, continuation, end.link
+ end.link, continuation.link = continuation, end.link
def _mark_hot_blocks(self) -> None:
# Start with the last block, and perform a DFS to find all blocks that
@@ -285,8 +286,6 @@ def run(self) -> None:
class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods
"""aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu"""
- # TODO: @diegorusso
- _alignment = 8
# https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch-
_re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)")
@@ -302,18 +301,3 @@ class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods
_re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)")
# https://www.felixcloutier.com/x86/ret
_re_return = re.compile(r"\s*ret\b")
-
-
-class OptimizerX8664Windows(OptimizerX86): # pylint: disable =
too-few-public-methods
- """x86_64-pc-windows-msvc"""
-
- def _preprocess(self, text: str) -> str:
- text = super()._preprocess(text)
- # Before:
- # rex64 jmpq *__imp__JIT_CONTINUE(%rip)
- # After:
- # jmp _JIT_CONTINUE
- far_indirect_jump = (
-
rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)"
- )
- return re.sub(far_indirect_jump, r"jmp\t\g<target>", text)
diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py
index 728f48128ce79c..3883671e92aa39 100644
--- a/Tools/jit/_targets.py
+++ b/Tools/jit/_targets.py
@@ -44,7 +44,8 @@ class _Target(typing.Generic[_S, _R]):
_: dataclasses.KW_ONLY
args: typing.Sequence[str] = ()
optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer
- prefix: str = ""
+ label_prefix: typing.ClassVar[str]
+ symbol_prefix: typing.ClassVar[str]
stable: bool = False
debug: bool = False
verbose: bool = False
@@ -172,7 +173,9 @@ async def _compile(
*shlex.split(self.cflags),
]
await _llvm.run("clang", args_s, echo=self.verbose)
- self.optimizer(s, prefix=self.prefix).run()
+ self.optimizer(
+ s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix
+ ).run()
args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"]
await _llvm.run("clang", args_o, echo=self.verbose)
return await self._parse(o)
@@ -274,7 +277,7 @@ def _handle_section(
symbol = wrapped_symbol["Symbol"]
offset = base + symbol["Value"]
name = symbol["Name"]
- name = name.removeprefix(self.prefix)
+ name = name.removeprefix(self.symbol_prefix)
if name not in group.symbols:
group.symbols[name] = value, offset
for wrapped_relocation in section["Relocations"]:
@@ -285,9 +288,9 @@ def _handle_section(
def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str |
None]:
if name.startswith("__imp_"):
name = name.removeprefix("__imp_")
- name = name.removeprefix(self.prefix)
+ name = name.removeprefix(self.symbol_prefix)
return _stencils.HoleValue.GOT, name
- name = name.removeprefix(self.prefix)
+ name = name.removeprefix(self.symbol_prefix)
return _stencils.symbol_to_value(name)
def _handle_relocation(
@@ -335,9 +338,24 @@ def _handle_relocation(
return _stencils.Hole(offset, kind, value, symbol, addend)
+class _COFF32(_COFF):
+ # These mangle like Mach-O and other "older" formats:
+ label_prefix = "L"
+ symbol_prefix = "_"
+
+
+class _COFF64(_COFF):
+ # These mangle like ELF and other "newer" formats:
+ label_prefix = ".L"
+ symbol_prefix = ""
+
+
class _ELF(
_Target[_schema.ELFSection, _schema.ELFRelocation]
): # pylint: disable = too-few-public-methods
+ label_prefix = ".L"
+ symbol_prefix = ""
+
def _handle_section(
self, section: _schema.ELFSection, group: _stencils.StencilGroup
) -> None:
@@ -374,7 +392,7 @@ def _handle_section(
symbol = wrapped_symbol["Symbol"]
offset = len(stencil.body) + symbol["Value"]
name = symbol["Name"]["Name"]
- name = name.removeprefix(self.prefix)
+ name = name.removeprefix(self.symbol_prefix)
group.symbols[name] = value, offset
stencil.body.extend(section["SectionData"]["Bytes"])
assert not section["Relocations"]
@@ -409,7 +427,7 @@ def _handle_relocation(
},
}:
offset += base
- s = s.removeprefix(self.prefix)
+ s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.HoleValue.GOT, s
case {
"Addend": addend,
@@ -418,7 +436,7 @@ def _handle_relocation(
"Type": {"Name": kind},
}:
offset += base
- s = s.removeprefix(self.prefix)
+ s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.symbol_to_value(s)
case _:
raise NotImplementedError(relocation)
@@ -428,6 +446,9 @@ def _handle_relocation(
class _MachO(
_Target[_schema.MachOSection, _schema.MachORelocation]
): # pylint: disable = too-few-public-methods
+ label_prefix = "L"
+ symbol_prefix = "_"
+
def _handle_section(
self, section: _schema.MachOSection, group: _stencils.StencilGroup
) -> None:
@@ -435,10 +456,10 @@ def _handle_section(
assert "SectionData" in section
flags = {flag["Name"] for flag in section["Attributes"]["Flags"]}
name = section["Name"]["Value"]
- name = name.removeprefix(self.prefix)
+ name = name.removeprefix(self.symbol_prefix)
if "Debug" in flags:
return
- if "SomeInstructions" in flags:
+ if "PureInstructions" in flags:
value = _stencils.HoleValue.CODE
stencil = group.code
start_address = 0
@@ -459,7 +480,7 @@ def _handle_section(
symbol = wrapped_symbol["Symbol"]
offset = symbol["Value"] - start_address
name = symbol["Name"]["Name"]
- name = name.removeprefix(self.prefix)
+ name = name.removeprefix(self.symbol_prefix)
group.symbols[name] = value, offset
assert "Relocations" in section
for wrapped_relocation in section["Relocations"]:
@@ -484,7 +505,7 @@ def _handle_relocation(
},
}:
offset += base
- s = s.removeprefix(self.prefix)
+ s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.HoleValue.GOT, s
addend = 0
case {
@@ -493,7 +514,7 @@ def _handle_relocation(
"Type": {"Name": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD"
as kind},
}:
offset += base
- s = s.removeprefix(self.prefix)
+ s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.HoleValue.GOT, s
addend = (
int.from_bytes(raw[offset : offset + 4], "little",
signed=True) - 4
@@ -508,7 +529,7 @@ def _handle_relocation(
"Type": {"Name": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED"
as kind},
}:
offset += base
- s = s.removeprefix(self.prefix)
+ s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.symbol_to_value(s)
addend = (
int.from_bytes(raw[offset : offset + 4], "little",
signed=True) - 4
@@ -523,7 +544,7 @@ def _handle_relocation(
"Type": {"Name": kind},
}:
offset += base
- s = s.removeprefix(self.prefix)
+ s = s.removeprefix(self.symbol_prefix)
value, symbol = _stencils.symbol_to_value(s)
addend = 0
case _:
@@ -531,19 +552,19 @@ def _handle_relocation(
return _stencils.Hole(offset, kind, value, symbol, addend)
-def get_target(host: str) -> _COFF | _ELF | _MachO:
+def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO:
"""Build a _Target for the given host "triple" and options."""
optimizer: type[_optimizers.Optimizer]
- target: _COFF | _ELF | _MachO
+ target: _COFF32 | _COFF64 | _ELF | _MachO
if re.fullmatch(r"aarch64-apple-darwin.*", host):
condition = "defined(__aarch64__) && defined(__APPLE__)"
optimizer = _optimizers.OptimizerAArch64
- target = _MachO(host, condition, optimizer=optimizer, prefix="_")
+ target = _MachO(host, condition, optimizer=optimizer)
elif re.fullmatch(r"aarch64-pc-windows-msvc", host):
args = ["-fms-runtime-lib=dll", "-fplt"]
condition = "defined(_M_ARM64)"
optimizer = _optimizers.OptimizerAArch64
- target = _COFF(host, condition, args=args, optimizer=optimizer)
+ target = _COFF64(host, condition, args=args, optimizer=optimizer)
elif re.fullmatch(r"aarch64-.*-linux-gnu", host):
# -mno-outline-atomics: Keep intrinsics from being emitted.
args = ["-fpic", "-mno-outline-atomics"]
@@ -555,16 +576,16 @@ def get_target(host: str) -> _COFF | _ELF | _MachO:
args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"]
optimizer = _optimizers.OptimizerX86
condition = "defined(_M_IX86)"
- target = _COFF(host, condition, args=args, optimizer=optimizer,
prefix="_")
+ target = _COFF32(host, condition, args=args, optimizer=optimizer)
elif re.fullmatch(r"x86_64-apple-darwin.*", host):
condition = "defined(__x86_64__) && defined(__APPLE__)"
optimizer = _optimizers.OptimizerX86
- target = _MachO(host, condition, optimizer=optimizer, prefix="_")
+ target = _MachO(host, condition, optimizer=optimizer)
elif re.fullmatch(r"x86_64-pc-windows-msvc", host):
args = ["-fms-runtime-lib=dll"]
condition = "defined(_M_X64)"
- optimizer = _optimizers.OptimizerX8664Windows
- target = _COFF(host, condition, args=args, optimizer=optimizer)
+ optimizer = _optimizers.OptimizerX86
+ target = _COFF64(host, condition, args=args, optimizer=optimizer)
elif re.fullmatch(r"x86_64-.*-linux-gnu", host):
args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"]
condition = "defined(__x86_64__) && defined(__linux__)"
diff --git a/Tools/jit/jit.h b/Tools/jit/jit.h
index f767ef68127eb7..10829654eabb38 100644
--- a/Tools/jit/jit.h
+++ b/Tools/jit/jit.h
@@ -6,3 +6,7 @@ typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none;
#define PATCH_VALUE(TYPE, NAME, ALIAS) \
PyAPI_DATA(void) ALIAS; \
TYPE NAME = (TYPE)(uintptr_t)&ALIAS;
+
+#define DECLARE_TARGET(NAME) \
+ _Py_CODEUNIT *__attribute__((preserve_none, visibility("hidden"))) \
+ NAME(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState
*tstate);
diff --git a/Tools/jit/shim.c b/Tools/jit/shim.c
index ebd4e9bc858b73..0c7feb746c9679 100644
--- a/Tools/jit/shim.c
+++ b/Tools/jit/shim.c
@@ -10,6 +10,6 @@ _Py_CODEUNIT *
_JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer,
PyThreadState *tstate)
{
// Note that this is *not* a tail call:
- PATCH_VALUE(jit_func_preserve_none, call, _JIT_CONTINUE);
- return call(frame, stack_pointer, tstate);
+ DECLARE_TARGET(_JIT_CONTINUE);
+ return _JIT_CONTINUE(frame, stack_pointer, tstate);
}
diff --git a/Tools/jit/template.c b/Tools/jit/template.c
index 5ee26f93f1e266..d07f56e9ce6b42 100644
--- a/Tools/jit/template.c
+++ b/Tools/jit/template.c
@@ -74,10 +74,10 @@ do { \
do { \
} while (0)
-#define PATCH_JUMP(ALIAS) \
-do { \
- PATCH_VALUE(jit_func_preserve_none, jump, ALIAS); \
- __attribute__((musttail)) return jump(frame, stack_pointer, tstate); \
+#define PATCH_JUMP(ALIAS) \
+do { \
+ DECLARE_TARGET(ALIAS); \
+ __attribute__((musttail)) return ALIAS(frame, stack_pointer, tstate); \
} while (0)
#undef JUMP_TO_JUMP_TARGET
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]