https://github.com/python/cpython/commit/d77aaa73116aa469cc6b7a0f8a68f3f30fd41962
commit: d77aaa73116aa469cc6b7a0f8a68f3f30fd41962
branch: main
author: Mark Shannon <[email protected]>
committer: markshannon <[email protected]>
date: 2026-01-22T10:55:49Z
summary:
GH-139109: Partial reworking of JIT data structures (GH-144105)
* Halve size of buffers by reusing combined trace + optimizer buffers for TOS caching
* Add simple buffer struct for more maintainable handling of buffers
* Decouple JIT structs from thread state struct
* Ensure terminator is added to trace, when optimizer gives up
files:
M Include/internal/pycore_optimizer.h
M Include/internal/pycore_optimizer_types.h
M Include/internal/pycore_tstate.h
M Include/internal/pycore_uop.h
M Python/ceval_macros.h
M Python/optimizer.c
M Python/optimizer_analysis.c
M Python/optimizer_bytecodes.c
M Python/optimizer_cases.c.h
M Python/pystate.c
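
The central change is the new _PyJitUopBuffer struct, which replaces the old index/length bookkeeping (trace_length, code_curr_size, code_max_size) with three pointers. A minimal standalone sketch of that pattern, using a simplified instruction type rather than the real _PyUOpInstruction:

    #include <assert.h>
    #include <stdio.h>

    typedef struct { int opcode; int oparg; } inst_t;   /* simplified stand-in */

    typedef struct {
        inst_t *start;  /* first slot of the backing array */
        inst_t *next;   /* one past the last written instruction */
        inst_t *end;    /* one past the last usable slot */
    } buf_t;

    static void buf_init(buf_t *b, inst_t *mem, int size) {
        b->next = b->start = mem;
        b->end = mem + size;
    }
    static int buf_length(buf_t *b)    { return (int)(b->next - b->start); }
    static int buf_remaining(buf_t *b) { return (int)(b->end - b->next); }
    static inst_t *buf_last(buf_t *b)  { assert(b->next > b->start); return b->next - 1; }

    int main(void) {
        inst_t mem[8];
        buf_t b;
        buf_init(&b, mem, 8);
        if (buf_remaining(&b) >= 1) {             /* replaces the old RESERVE_RAW check */
            *b.next++ = (inst_t){ .opcode = 1 };  /* replaces trace[trace_length++] = ... */
        }
        printf("len=%d last=%d\n", buf_length(&b), buf_last(&b)->opcode);
        return 0;
    }

Reserving space then amounts to shrinking end (as the translator does before expanding each bytecode) or comparing the remaining space against what is needed, as in the diff below.
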
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index fbe403b492d5ac..2ee518fb82f301 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -16,12 +16,102 @@ extern "C" {
#include <stdbool.h>
+typedef struct _PyJitUopBuffer {
+ _PyUOpInstruction *start;
+ _PyUOpInstruction *next;
+ _PyUOpInstruction *end;
+} _PyJitUopBuffer;
+
+
+typedef struct _JitOptContext {
+ char done;
+ char out_of_space;
+ bool contradiction;
+ // Has the builtins dict been watched?
+ bool builtins_watched;
+ // The current "executing" frame.
+ _Py_UOpsAbstractFrame *frame;
+ _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
+ int curr_frame_depth;
+
+ // Arena for the symbolic types.
+ ty_arena t_arena;
+
+ JitOptRef *n_consumed;
+ JitOptRef *limit;
+ JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
+ _PyJitUopBuffer out_buffer;
+} JitOptContext;
+
+
+static inline void
+uop_buffer_init(_PyJitUopBuffer *trace, _PyUOpInstruction *start, uint32_t size)
+{
+ trace->next = trace->start = start;
+ trace->end = start + size;
+}
+
+static inline _PyUOpInstruction *
+uop_buffer_last(_PyJitUopBuffer *trace)
+{
+ assert(trace->next > trace->start);
+ return trace->next-1;
+}
+
+static inline int
+uop_buffer_length(_PyJitUopBuffer *trace)
+{
+ return (int)(trace->next - trace->start);
+}
+
+static inline int
+uop_buffer_remaining_space(_PyJitUopBuffer *trace)
+{
+ return (int)(trace->end - trace->next);
+}
+
+typedef struct _PyJitTracerInitialState {
+ int stack_depth;
+ int chain_depth;
+ struct _PyExitData *exit;
+ PyCodeObject *code; // Strong
+ PyFunctionObject *func; // Strong
+ struct _PyExecutorObject *executor; // Strong
+ _Py_CODEUNIT *start_instr;
+ _Py_CODEUNIT *close_loop_instr;
+ _Py_CODEUNIT *jump_backward_instr;
+} _PyJitTracerInitialState;
+
+typedef struct _PyJitTracerPreviousState {
+ bool dependencies_still_valid;
+ int instr_oparg;
+ int instr_stacklevel;
+ _Py_CODEUNIT *instr;
+ PyCodeObject *instr_code; // Strong
+ struct _PyInterpreterFrame *instr_frame;
+ _PyBloomFilter dependencies;
+} _PyJitTracerPreviousState;
+
+typedef struct _PyJitTracerTranslatorState {
+ int jump_backward_seen;
+} _PyJitTracerTranslatorState;
+
+typedef struct _PyJitTracerState {
+ bool is_tracing;
+ _PyJitTracerInitialState initial_state;
+ _PyJitTracerPreviousState prev_state;
+ _PyJitTracerTranslatorState translator_state;
+ JitOptContext opt_context;
+ _PyJitUopBuffer code_buffer;
+ _PyJitUopBuffer out_buffer;
+ _PyUOpInstruction uop_array[2 * UOP_MAX_TRACE_LENGTH];
+} _PyJitTracerState;
+
typedef struct _PyExecutorLinkListNode {
struct _PyExecutorObject *next;
struct _PyExecutorObject *previous;
} _PyExecutorLinkListNode;
-
typedef struct {
uint8_t opcode;
uint8_t oparg;
@@ -86,8 +176,8 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
int _Py_uop_analyze_and_optimize(
_PyThreadStateImpl *tstate,
- _PyUOpInstruction *trace, int trace_len, int curr_stackentries,
- _PyBloomFilter *dependencies);
+ _PyUOpInstruction *input, int trace_len, int curr_stackentries,
+ _PyUOpInstruction *output, _PyBloomFilter *dependencies);
extern PyTypeObject _PyUOpExecutor_Type;
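
For orientation: the single uop_array of 2 * UOP_MAX_TRACE_LENGTH entries in _PyJitTracerState above is what replaces the two separate full-size buffers. The tracer's code_buffer is initialized over the first half, and the optimizer output goes into the second half (see the uop_buffer_init call and &uop_array[UOP_MAX_TRACE_LENGTH] in the Python/optimizer.c hunks below). A sketch of that split, with simplified types:

    #include <stdio.h>

    enum { MAX_LEN = 2500 };                   /* stands in for UOP_MAX_TRACE_LENGTH */
    typedef struct { int opcode; } uop_t;      /* stands in for _PyUOpInstruction */

    static uop_t uop_array[2 * MAX_LEN];       /* one backing array, two logical halves */

    int main(void) {
        uop_t *trace_half  = &uop_array[0];        /* tracer/translator buffer */
        uop_t *output_half = &uop_array[MAX_LEN];  /* optimizer output buffer */
        printf("halves are %td entries apart\n", output_half - trace_half);
        return 0;
    }

stack_allocate then writes back into the first half while reading from the second, so uop_optimize can hand buffers around without any extra allocation.
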
diff --git a/Include/internal/pycore_optimizer_types.h b/Include/internal/pycore_optimizer_types.h
index 7e0dbddce2d6b8..a879ca26ce7b63 100644
--- a/Include/internal/pycore_optimizer_types.h
+++ b/Include/internal/pycore_optimizer_types.h
@@ -126,27 +126,6 @@ typedef struct ty_arena {
JitOptSymbol arena[TY_ARENA_SIZE];
} ty_arena;
-typedef struct _JitOptContext {
- char done;
- char out_of_space;
- bool contradiction;
- // Has the builtins dict been watched?
- bool builtins_watched;
- // The current "executing" frame.
- _Py_UOpsAbstractFrame *frame;
- _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
- int curr_frame_depth;
-
- // Arena for the symbolic types.
- ty_arena t_arena;
-
- JitOptRef *n_consumed;
- JitOptRef *limit;
- JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
- _PyUOpInstruction *out_buffer;
- int out_len;
-} JitOptContext;
-
#ifdef __cplusplus
}
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 24a40416c2191b..64b90710b8e664 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -12,7 +12,6 @@ extern "C" {
#include "pycore_freelist_state.h" // struct _Py_freelists
#include "pycore_interpframe_structs.h" // _PyInterpreterFrame
#include "pycore_mimalloc.h" // struct _mimalloc_thread_state
-#include "pycore_optimizer_types.h" // JitOptContext
#include "pycore_qsbr.h" // struct qsbr
#include "pycore_uop.h" // struct _PyUOpInstruction
#include "pycore_structs.h"
@@ -24,46 +23,6 @@ struct _gc_thread_state {
};
#endif
-#if _Py_TIER2
-typedef struct _PyJitTracerInitialState {
- int stack_depth;
- int chain_depth;
- struct _PyExitData *exit;
- PyCodeObject *code; // Strong
- PyFunctionObject *func; // Strong
- struct _PyExecutorObject *executor; // Strong
- _Py_CODEUNIT *start_instr;
- _Py_CODEUNIT *close_loop_instr;
- _Py_CODEUNIT *jump_backward_instr;
-} _PyJitTracerInitialState;
-
-typedef struct _PyJitTracerPreviousState {
- bool dependencies_still_valid;
- int code_max_size;
- int code_curr_size;
- int instr_oparg;
- int instr_stacklevel;
- _Py_CODEUNIT *instr;
- PyCodeObject *instr_code; // Strong
- struct _PyInterpreterFrame *instr_frame;
- _PyBloomFilter dependencies;
-} _PyJitTracerPreviousState;
-
-typedef struct _PyJitTracerTranslatorState {
- int jump_backward_seen;
-} _PyJitTracerTranslatorState;
-
-typedef struct _PyJitTracerState {
- bool is_tracing;
- _PyJitTracerInitialState initial_state;
- _PyJitTracerPreviousState prev_state;
- _PyJitTracerTranslatorState translator_state;
- JitOptContext opt_context;
- _PyUOpInstruction code_buffer[UOP_MAX_TRACE_LENGTH];
- _PyUOpInstruction out_buffer[UOP_MAX_TRACE_LENGTH];
-} _PyJitTracerState;
-
-#endif
// Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
// PyThreadState fields are exposed as part of the C API, although most fields
@@ -141,7 +100,7 @@ typedef struct _PyThreadStateImpl {
Py_ssize_t reftotal; // this thread's total refcount operations
#endif
#if _Py_TIER2
- _PyJitTracerState *jit_tracer_state;
+ struct _PyJitTracerState *jit_tracer_state;
#endif
} _PyThreadStateImpl;
diff --git a/Include/internal/pycore_uop.h b/Include/internal/pycore_uop.h
index e828a1cc5a5722..f9be01acb57197 100644
--- a/Include/internal/pycore_uop.h
+++ b/Include/internal/pycore_uop.h
@@ -38,11 +38,10 @@ typedef struct _PyUOpInstruction{
// This is the length of the trace we translate initially.
#ifdef Py_DEBUG
// With asserts, the stencils are a lot larger
-#define UOP_MAX_TRACE_LENGTH 2000
+#define UOP_MAX_TRACE_LENGTH 1000
#else
-#define UOP_MAX_TRACE_LENGTH 5000
+#define UOP_MAX_TRACE_LENGTH 2500
#endif
-#define UOP_BUFFER_SIZE (UOP_MAX_TRACE_LENGTH * sizeof(_PyUOpInstruction))
/* Bloom filter with m = 256
* https://en.wikipedia.org/wiki/Bloom_filter */
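
Back-of-envelope on the memory effect of halving UOP_MAX_TRACE_LENGTH while merging the two buffers (the 24-byte instruction size here is an assumption for illustration, not taken from this diff):

    #include <stdio.h>

    int main(void) {
        const long inst = 24;               /* assumed sizeof(_PyUOpInstruction) */
        long before = 2 * 5000L * inst;     /* code_buffer + out_buffer at the old length */
        long after  = 2 * 2500L * inst;     /* single uop_array at the new, halved length */
        printf("per-thread trace storage: %ld -> %ld bytes\n", before, after);
        return 0;
    }

This matches the "halve size of buffers" bullet in the summary.
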
diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 3b4b3253b3638c..d791ba0e8eca97 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -433,7 +433,7 @@ do { \
JUMP_TO_LABEL(error); \
} \
if (keep_tracing_bit) { \
- assert(((_PyThreadStateImpl *)tstate)->jit_tracer_state->prev_state.code_curr_size == 2); \
+ assert(uop_buffer_length(&((_PyThreadStateImpl *)tstate)->jit_tracer_state->code_buffer)); \
ENTER_TRACING(); \
DISPATCH_NON_TRACING(); \
} \
diff --git a/Python/optimizer.c b/Python/optimizer.c
index 15a1eb5a17745b..f25242972efeb1 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -188,9 +188,6 @@ _PyOptimizer_Optimize(
}
insert_executor(code, start, index, executor);
}
- else {
- executor->vm_data.code = NULL;
- }
executor->vm_data.chain_depth = chain_depth;
assert(executor->vm_data.valid);
_PyExitData *exit = _tstate->jit_tracer_state->initial_state.exit;
@@ -547,52 +544,43 @@ guard_ip_uop[MAX_UOP_ID + 1] = {
#endif
-static inline int
+static inline void
add_to_trace(
- _PyUOpInstruction *trace,
- int trace_length,
+ _PyJitUopBuffer *trace,
uint16_t opcode,
uint16_t oparg,
uint64_t operand,
uint32_t target)
{
- trace[trace_length].opcode = opcode;
- trace[trace_length].format = UOP_FORMAT_TARGET;
- trace[trace_length].target = target;
- trace[trace_length].oparg = oparg;
- trace[trace_length].operand0 = operand;
+ _PyUOpInstruction *inst = trace->next;
+ inst->opcode = opcode;
+ inst->format = UOP_FORMAT_TARGET;
+ inst->target = target;
+ inst->oparg = oparg;
+ inst->operand0 = operand;
#ifdef Py_STATS
- trace[trace_length].execution_count = 0;
+ inst->execution_count = 0;
#endif
- return trace_length + 1;
+ trace->next++;
}
+
#ifdef Py_DEBUG
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
- assert(trace_length < max_length); \
- trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
+ add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
if (lltrace >= 2) { \
- printf("%4d ADD_TO_TRACE: ", trace_length); \
- _PyUOpPrint(&trace[trace_length-1]); \
+ printf("%4d ADD_TO_TRACE: ", uop_buffer_length(trace)); \
+ _PyUOpPrint(uop_buffer_last(trace)); \
printf("\n"); \
}
#else
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
- assert(trace_length < max_length); \
- trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET));
+ add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET))
#endif
#define INSTR_IP(INSTR, CODE) \
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
-// Reserve space for n uops
-#define RESERVE_RAW(n, opname) \
- if (trace_length + (n) > max_length) { \
- DPRINTF(2, "No room for %s (need %d, got %d)\n", \
- (opname), (n), max_length - trace_length); \
- OPT_STAT_INC(trace_too_long); \
- goto full; \
- }
static int
is_terminator(const _PyUOpInstruction *uop)
@@ -629,9 +617,7 @@ _PyJit_translate_single_bytecode_to_trace(
PyCodeObject *old_code = tracer->prev_state.instr_code;
bool progress_needed = (tracer->initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0;
_PyBloomFilter *dependencies = &tracer->prev_state.dependencies;
- int trace_length = tracer->prev_state.code_curr_size;
- _PyUOpInstruction *trace = tracer->code_buffer;
- int max_length = tracer->prev_state.code_max_size;
+ _PyJitUopBuffer *trace = &tracer->code_buffer;
_Py_CODEUNIT *this_instr = tracer->prev_state.instr;
_Py_CODEUNIT *target_instr = this_instr;
@@ -670,15 +656,13 @@ _PyJit_translate_single_bytecode_to_trace(
}
}
- int old_stack_level = tracer->prev_state.instr_stacklevel;
-
// Strange control-flow
bool has_dynamic_jump_taken = OPCODE_HAS_UNPREDICTABLE_JUMP(opcode) &&
(next_instr != this_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]);
/* Special case the first instruction,
* so that we can guarantee forward progress */
- if (progress_needed && tracer->prev_state.code_curr_size < CODE_SIZE_NO_PROGRESS) {
+ if (progress_needed && uop_buffer_length(&tracer->code_buffer) < CODE_SIZE_NO_PROGRESS) {
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
opcode = _PyOpcode_Deopt[opcode];
}
@@ -694,7 +678,7 @@ _PyJit_translate_single_bytecode_to_trace(
int is_sys_tracing = (tstate->c_tracefunc != NULL) || (tstate->c_profilefunc != NULL);
if (is_sys_tracing) {
- goto full;
+ goto done;
}
if (stop_tracing_opcode == _DEOPT) {
@@ -710,7 +694,7 @@ _PyJit_translate_single_bytecode_to_trace(
goto done;
}
- DPRINTF(2, "%p %d: %s(%d) %d %d\n", old_code, target,
_PyOpcode_OpName[opcode], oparg, needs_guard_ip, old_stack_level);
+ DPRINTF(2, "%p %d: %s(%d) %d\n", old_code, target,
_PyOpcode_OpName[opcode], oparg, needs_guard_ip);
#ifdef Py_DEBUG
if (oparg > 255) {
@@ -719,7 +703,7 @@ _PyJit_translate_single_bytecode_to_trace(
#endif
if (!tracer->prev_state.dependencies_still_valid) {
- goto full;
+ goto done;
}
// This happens when a recursive call happens that we can't trace. Such as Python -> C -> Python calls
@@ -734,16 +718,14 @@ _PyJit_translate_single_bytecode_to_trace(
unsupported:
{
// Rewind to previous instruction and replace with _EXIT_TRACE.
- _PyUOpInstruction *curr = &trace[trace_length-1];
- while (curr->opcode != _SET_IP && trace_length > 2) {
- trace_length--;
- curr = &trace[trace_length-1];
+ _PyUOpInstruction *curr = uop_buffer_last(trace);
+ while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
+ trace->next--;
+ curr = uop_buffer_last(trace);
}
- assert(curr->opcode == _SET_IP || trace_length == 2);
+ assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
if (curr->opcode == _SET_IP) {
int32_t old_target = (int32_t)uop_get_target(curr);
- curr++;
- trace_length++;
curr->opcode = _DEOPT;
curr->format = UOP_FORMAT_TARGET;
curr->target = old_target;
@@ -752,7 +734,6 @@ _PyJit_translate_single_bytecode_to_trace(
}
}
-
if (opcode == NOP) {
return 1;
}
@@ -766,7 +747,7 @@ _PyJit_translate_single_bytecode_to_trace(
}
// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
- max_length -= 2;
+ trace->end -= 2;
const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
@@ -775,18 +756,28 @@ _PyJit_translate_single_bytecode_to_trace(
if (OPCODE_HAS_EXIT(opcode)) {
- // Make space for side exit and final _EXIT_TRACE:
- max_length--;
+ // Make space for side exit
+ trace->end--;
}
if (OPCODE_HAS_ERROR(opcode)) {
- // Make space for error stub and final _EXIT_TRACE:
- max_length--;
+ // Make space for error stub
+ trace->end--;
+ }
+ if (OPCODE_HAS_DEOPT(opcode)) {
+ // Make space for side exit
+ trace->end--;
}
// _GUARD_IP leads to an exit.
- max_length -= needs_guard_ip;
+ trace->end -= needs_guard_ip;
- RESERVE_RAW(expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode)), "uop and various checks");
+ int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
+ if (uop_buffer_remaining_space(trace) < space_needed) {
+ DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
+ space_needed, uop_buffer_remaining_space(trace));
+ OPT_STAT_INC(trace_too_long);
+ goto done;
+ }
ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
@@ -825,7 +816,7 @@ _PyJit_translate_single_bytecode_to_trace(
{
if ((next_instr != tracer->initial_state.close_loop_instr) &&
(next_instr != tracer->initial_state.start_instr) &&
- tracer->prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS &&
+ uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
// For side exits, we don't want to terminate them early.
tracer->initial_state.exit == NULL &&
// These are coroutines, and we want to unroll those usually.
@@ -836,7 +827,7 @@ _PyJit_translate_single_bytecode_to_trace(
// inner loop might start and let the traces rejoin.
OPT_STAT_INC(inner_loop);
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
- trace[trace_length-1].operand1 = true; // is_control_flow
+ uop_buffer_last(trace)->operand1 = true; // is_control_flow
DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n",
next_instr,
tracer->initial_state.close_loop_instr,
tracer->initial_state.start_instr);
goto done;
@@ -913,19 +904,19 @@ _PyJit_translate_single_bytecode_to_trace(
}
break;
case OPERAND1_1:
- assert(trace[trace_length-1].opcode == uop);
+ assert(uop_buffer_last(trace)->opcode == uop);
operand = read_u16(&this_instr[offset].cache);
- trace[trace_length-1].operand1 = operand;
+ uop_buffer_last(trace)->operand1 = operand;
continue;
case OPERAND1_2:
- assert(trace[trace_length-1].opcode == uop);
+ assert(uop_buffer_last(trace)->opcode == uop);
operand = read_u32(&this_instr[offset].cache);
- trace[trace_length-1].operand1 = operand;
+ uop_buffer_last(trace)->operand1 = operand;
continue;
case OPERAND1_4:
- assert(trace[trace_length-1].opcode == uop);
+ assert(uop_buffer_last(trace)->opcode == uop);
operand = read_u64(&this_instr[offset].cache);
- trace[trace_length-1].operand1 = operand;
+ uop_buffer_last(trace)->operand1 = operand;
continue;
default:
fprintf(stderr,
@@ -955,7 +946,7 @@ _PyJit_translate_single_bytecode_to_trace(
}
}
ADD_TO_TRACE(uop, oparg, operand, target);
- trace[trace_length - 1].operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
+ uop_buffer_last(trace)->operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
break;
}
if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
@@ -973,9 +964,9 @@ _PyJit_translate_single_bytecode_to_trace(
} // End switch (opcode)
if (needs_guard_ip) {
- uint16_t guard_ip = guard_ip_uop[trace[trace_length-1].opcode];
+ uint16_t guard_ip = guard_ip_uop[uop_buffer_last(trace)->opcode];
if (guard_ip == 0) {
- DPRINTF(1, "Unknown uop needing guard ip %s\n",
_PyOpcode_uop_name[trace[trace_length-1].opcode]);
+ DPRINTF(1, "Unknown uop needing guard ip %s\n",
_PyOpcode_uop_name[uop_buffer_last(trace)->opcode]);
Py_UNREACHABLE();
}
ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0);
@@ -983,7 +974,7 @@ _PyJit_translate_single_bytecode_to_trace(
// Loop back to the start
int is_first_instr = tracer->initial_state.close_loop_instr == next_instr ||
tracer->initial_state.start_instr == next_instr;
- if (is_first_instr && tracer->prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS) {
+ if (is_first_instr && uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
if (needs_guard_ip) {
ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)next_instr, 0);
}
@@ -991,27 +982,13 @@ _PyJit_translate_single_bytecode_to_trace(
goto done;
}
DPRINTF(2, "Trace continuing\n");
- tracer->prev_state.code_curr_size = trace_length;
- tracer->prev_state.code_max_size = max_length;
return 1;
done:
DPRINTF(2, "Trace done\n");
- tracer->prev_state.code_curr_size = trace_length;
- tracer->prev_state.code_max_size = max_length;
- return 0;
-full:
- DPRINTF(2, "Trace full\n");
- if (!is_terminator(&tracer->code_buffer[trace_length-1])) {
- // Undo the last few instructions.
- trace_length = tracer->prev_state.code_curr_size;
- max_length = tracer->prev_state.code_max_size;
- // We previously reversed one.
- max_length += 1;
+ if (!is_terminator(uop_buffer_last(trace))) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
- trace[trace_length-1].operand1 = true; // is_control_flow
+ uop_buffer_last(trace)->operand1 = true; // is_control_flow
}
- tracer->prev_state.code_curr_size = trace_length;
- tracer->prev_state.code_max_size = max_length;
return 0;
}
@@ -1059,11 +1036,12 @@ _PyJit_TryInitializeTracing(
2 * INSTR_IP(close_loop_instr, code),
chain_depth);
#endif
- add_to_trace(tracer->code_buffer, 0, _START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
- add_to_trace(tracer->code_buffer, 1, _MAKE_WARM, 0, 0, 0);
- tracer->prev_state.code_curr_size = CODE_SIZE_EMPTY;
+ /* Set up tracing buffer*/
+ _PyJitUopBuffer *trace = &tracer->code_buffer;
+ uop_buffer_init(trace, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
+ ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
+ ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);
- tracer->prev_state.code_max_size = UOP_MAX_TRACE_LENGTH/2;
tracer->initial_state.start_instr = start_instr;
tracer->initial_state.close_loop_instr = close_loop_instr;
tracer->initial_state.code = (PyCodeObject *)Py_NewRef(code);
@@ -1122,8 +1100,7 @@ _PyJit_FinalizeTracing(PyThreadState *tstate, int err)
Py_CLEAR(tracer->initial_state.func);
Py_CLEAR(tracer->initial_state.executor);
Py_CLEAR(tracer->prev_state.instr_code);
- tracer->prev_state.code_curr_size = CODE_SIZE_EMPTY;
- tracer->prev_state.code_max_size = UOP_MAX_TRACE_LENGTH/2 - 1;
+ uop_buffer_init(&tracer->code_buffer, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
tracer->is_tracing = false;
}
@@ -1137,7 +1114,6 @@ _PyJit_TracerFree(_PyThreadStateImpl *_tstate)
}
#undef RESERVE
-#undef RESERVE_RAW
#undef INSTR_IP
#undef ADD_TO_TRACE
#undef DPRINTF
@@ -1467,39 +1443,47 @@ int effective_trace_length(_PyUOpInstruction *buffer, int length)
static int
-stack_allocate(_PyUOpInstruction *buffer, int length)
+stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length)
{
assert(buffer[0].opcode == _START_EXECUTOR);
- for (int i = length-1; i >= 0; i--) {
- buffer[i*2+1] = buffer[i];
- buffer[i*2].format = UOP_FORMAT_TARGET;
- buffer[i*2].oparg = 0;
- buffer[i*2].target = 0;
+ /* The input buffer and output buffers will overlap.
+ Make sure that we can move instructions to the output
+ without overwriting the input. */
+ if (buffer == output) {
+ // This can only happen if optimizer has not been run
+ for (int i = 0; i < length; i++) {
+ buffer[i + UOP_MAX_TRACE_LENGTH] = buffer[i];
+ }
+ buffer += UOP_MAX_TRACE_LENGTH;
+ }
+ else {
+ assert(output + UOP_MAX_TRACE_LENGTH == buffer);
}
int depth = 0;
+ _PyUOpInstruction *write = output;
for (int i = 0; i < length; i++) {
- _PyUOpInstruction *spill_or_reload = &buffer[i*2];
- int uop = buffer[i*2+1].opcode;
+ int uop = buffer[i].opcode;
if (uop == _NOP) {
- // leave _NOPs to be cleaned up later
- spill_or_reload->opcode = _NOP;
continue;
}
int new_depth = _PyUop_Caching[uop].best[depth];
- if (new_depth == depth) {
- spill_or_reload->opcode = _NOP;
- }
- else {
- spill_or_reload->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
+ if (new_depth != depth) {
+ write->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
+ assert(write->opcode != 0);
+ write->format = UOP_FORMAT_TARGET;
+ write->oparg = 0;
+ write->target = 0;
+ write++;
depth = new_depth;
}
+ *write = buffer[i];
uint16_t new_opcode = _PyUop_Caching[uop].entries[depth].opcode;
assert(new_opcode != 0);
- assert(spill_or_reload->opcode != 0);
- buffer[i*2+1].opcode = new_opcode;
+ write->opcode = new_opcode;
+ write++;
depth = _PyUop_Caching[uop].entries[depth].output;
}
- return length*2;
+ return write - output;
}
static int
@@ -1512,28 +1496,28 @@ uop_optimize(
_PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
assert(_tstate->jit_tracer_state != NULL);
_PyBloomFilter *dependencies = &_tstate->jit_tracer_state->prev_state.dependencies;
- _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer;
+ _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start;
OPT_STAT_INC(attempts);
bool is_noopt = !tstate->interp->opt_config.uops_optimize_enabled;
int curr_stackentries = _tstate->jit_tracer_state->initial_state.stack_depth;
- int length = _tstate->jit_tracer_state->prev_state.code_curr_size;
+ int length = uop_buffer_length(&_tstate->jit_tracer_state->code_buffer);
if (length <= CODE_SIZE_NO_PROGRESS) {
return 0;
}
assert(length > 0);
- assert(length < UOP_MAX_TRACE_LENGTH/2);
+ assert(length < UOP_MAX_TRACE_LENGTH);
OPT_STAT_INC(traces_created);
if (!is_noopt) {
+ _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH];
length = _Py_uop_analyze_and_optimize(
- _tstate,
- buffer, length,
- curr_stackentries, dependencies);
+ _tstate, buffer, length, curr_stackentries,
+ output, dependencies);
if (length <= 0) {
return length;
}
- buffer = _tstate->jit_tracer_state->out_buffer;
+ buffer = output;
}
- assert(length < UOP_MAX_TRACE_LENGTH/2);
+ assert(length < UOP_MAX_TRACE_LENGTH);
assert(length >= 1);
/* Fix up */
for (int pc = 0; pc < length; pc++) {
@@ -1549,7 +1533,9 @@ uop_optimize(
assert(_PyOpcode_uop_name[buffer[pc].opcode]);
}
OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
- length = stack_allocate(buffer, length);
+ _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[0];
+ length = stack_allocate(buffer, output, length);
+ buffer = output;
length = prepare_for_execution(buffer, length);
assert(length <= UOP_MAX_TRACE_LENGTH);
_PyExecutorObject *executor = make_executor_from_uops(
@@ -1707,6 +1693,7 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s
{
executor->vm_data.valid = true;
executor->vm_data.pending_deletion = 0;
+ executor->vm_data.code = NULL;
for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
}
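
The trickiest part of the reworked stack_allocate above is the aliasing: the rewritten trace is emitted into the first half of uop_array while the source is normally the optimizer output in the second half, and when the optimizer was skipped the source is the first half itself. A simplified sketch of that overlap handling (not the real function; spill/reload emission is elided):

    #include <stdio.h>
    #include <string.h>

    enum { HALF = 2500 };                       /* stands in for UOP_MAX_TRACE_LENGTH */
    typedef struct { int opcode; } uop_t;

    static uop_t uop_array[2 * HALF];

    static int rewrite(uop_t *input, uop_t *output, int length) {
        if (input == output) {
            /* Optimizer was skipped: input sits in the first half. Shift it into
               the unused second half so writes below cannot clobber unread input. */
            memmove(input + HALF, input, (size_t)length * sizeof(uop_t));
            input += HALF;
        }
        uop_t *write = output;
        for (int i = 0; i < length; i++) {
            /* The real code may also emit a spill/reload uop here when the
               cached-TOS depth changes; this sketch only copies. */
            *write++ = input[i];
        }
        return (int)(write - output);
    }

    int main(void) {
        uop_array[0].opcode = 7;
        int n = rewrite(&uop_array[0], &uop_array[0], 1);
        printf("rewrote %d uops, first opcode %d\n", n, uop_array[0].opcode);
        return 0;
    }
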
diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index e4e259a81b510f..c6a1ae60a317fa 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -203,14 +203,14 @@ static inline void
add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr,
uint16_t opcode, uint16_t oparg, uintptr_t operand0)
{
- _PyUOpInstruction *out = &ctx->out_buffer[ctx->out_len];
+ _PyUOpInstruction *out = ctx->out_buffer.next;
out->opcode = (opcode);
out->format = this_instr->format;
out->oparg = (oparg);
out->target = this_instr->target;
out->operand0 = (operand0);
out->operand1 = this_instr->operand1;
- ctx->out_len++;
+ ctx->out_buffer.next++;
}
/* Shortened forms for convenience, used in optimizer_bytecodes.c */
@@ -430,6 +430,7 @@ optimize_uops(
_PyUOpInstruction *trace,
int trace_len,
int curr_stacklen,
+ _PyUOpInstruction *output,
_PyBloomFilter *dependencies
)
{
@@ -440,7 +441,7 @@ optimize_uops(
JitOptContext *ctx = &tstate->jit_tracer_state->opt_context;
uint32_t opcode = UINT16_MAX;
- ctx->out_buffer = tstate->jit_tracer_state->out_buffer;
+ uop_buffer_init(&ctx->out_buffer, output, UOP_MAX_TRACE_LENGTH);
// Make sure that watchers are set up
PyInterpreterState *interp = _PyInterpreterState_GET();
@@ -458,14 +459,20 @@ optimize_uops(
ctx->curr_frame_depth++;
ctx->frame = frame;
- ctx->out_len = 0;
-
_PyUOpInstruction *this_instr = NULL;
JitOptRef *stack_pointer = ctx->frame->stack_pointer;
- for (int i = 0; !ctx->done; i++) {
- assert(i < trace_len);
+ for (int i = 0; i < trace_len; i++) {
this_instr = &trace[i];
+ if (ctx->done) {
+ // Don't do any more optimization, but
+ // we still need to reach a terminator for correctness.
+ *(ctx->out_buffer.next++) = *this_instr;
+ if (is_terminator_uop(this_instr)) {
+ break;
+ }
+ continue;
+ }
int oparg = this_instr->oparg;
opcode = this_instr->opcode;
@@ -485,6 +492,8 @@ optimize_uops(
}
#endif
+ _PyUOpInstruction *out_ptr = ctx->out_buffer.next;
+
switch (opcode) {
#include "optimizer_cases.c.h"
@@ -494,8 +503,8 @@ optimize_uops(
Py_UNREACHABLE();
}
// If no ADD_OP was called during this iteration, copy the original instruction
- if (ctx->out_len == i) {
- ctx->out_buffer[ctx->out_len++] = *this_instr;
+ if (ctx->out_buffer.next == out_ptr) {
+ *(ctx->out_buffer.next++) = *this_instr;
}
assert(ctx->frame != NULL);
if (!CURRENT_FRAME_IS_INIT_SHIM()) {
@@ -526,20 +535,11 @@ optimize_uops(
* would be no benefit in retrying later */
_Py_uop_abstractcontext_fini(ctx);
// Check that the trace ends with a proper terminator
- if (ctx->out_len > 0) {
- _PyUOpInstruction *last_uop = &ctx->out_buffer[ctx->out_len - 1];
- if (!is_terminator_uop(last_uop)) {
- // Copy remaining uops from original trace until we find a terminator
- for (int i = ctx->out_len; i < trace_len; i++) {
- ctx->out_buffer[ctx->out_len++] = trace[i];
- if (is_terminator_uop(&trace[i])) {
- break;
- }
- }
- }
+ if (uop_buffer_length(&ctx->out_buffer) > 0) {
+ assert(is_terminator_uop(uop_buffer_last(&ctx->out_buffer)));
}
- return ctx->out_len;
+ return uop_buffer_length(&ctx->out_buffer);
error:
DPRINTF(3, "\n");
@@ -696,14 +696,15 @@ _Py_uop_analyze_and_optimize(
_PyUOpInstruction *buffer,
int length,
int curr_stacklen,
+ _PyUOpInstruction *output,
_PyBloomFilter *dependencies
)
{
OPT_STAT_INC(optimizer_attempts);
length = optimize_uops(
- tstate, buffer,
- length, curr_stacklen, dependencies);
+ tstate, buffer, length, curr_stacklen,
+ output, dependencies);
if (length == 0) {
return length;
@@ -711,7 +712,7 @@ _Py_uop_analyze_and_optimize(
assert(length > 0);
- length = remove_unneeded_uops(tstate->jit_tracer_state->out_buffer, length);
+ length = remove_unneeded_uops(output, length);
assert(length > 0);
OPT_STAT_INC(optimizer_successes);
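
The loop change in optimize_uops above is what the "ensure terminator is added to trace, when optimizer gives up" bullet refers to: once ctx->done is set, the remaining input uops are copied through verbatim until a terminator is reached, so the output buffer always ends in one. A reduced sketch of that tail-copy, with placeholder opcodes:

    #include <stdio.h>

    enum { OP_ADD = 1, OP_EXIT_TRACE = 2 };     /* placeholder opcodes */

    static int is_terminator(int opcode) { return opcode == OP_EXIT_TRACE; }

    /* Copy input[done_at..] into out until (and including) the first terminator. */
    static int copy_tail(const int *input, int len, int done_at, int *out, int out_len) {
        for (int i = done_at; i < len; i++) {
            out[out_len++] = input[i];          /* no further optimization attempted */
            if (is_terminator(input[i])) {
                break;                          /* output now ends with a terminator */
            }
        }
        return out_len;
    }

    int main(void) {
        int input[] = { OP_ADD, OP_ADD, OP_EXIT_TRACE, OP_ADD };
        int out[8] = { OP_ADD };
        int n = copy_tail(input, 4, 1, out, 1);  /* pretend the optimizer gave up at index 1 */
        printf("output length %d, last opcode %d\n", n, out[n - 1]);
        return 0;
    }
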
diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c
index 0ccc788dff962d..1584e731d1b2d4 100644
--- a/Python/optimizer_bytecodes.c
+++ b/Python/optimizer_bytecodes.c
@@ -194,7 +194,6 @@ dummy_func(void) {
_Py_BloomFilter_Add(dependencies, type);
}
}
-
}
}
@@ -798,7 +797,7 @@ dummy_func(void) {
if (sym_is_const(ctx, callable) && sym_matches_type(callable, &PyFunction_Type)) {
assert(PyFunction_Check(sym_get_const(ctx, callable)));
ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
- ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)sym_get_const(ctx, callable);
+ uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)sym_get_const(ctx, callable);
}
sym_set_type(callable, &PyFunction_Type);
}
@@ -808,7 +807,7 @@ dummy_func(void) {
PyMethodObject *method = (PyMethodObject *)sym_get_const(ctx, callable);
assert(PyMethod_Check(method));
ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
- ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)method->im_func;
+ uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)method->im_func;
}
sym_set_type(callable, &PyMethod_Type);
}
@@ -1570,7 +1569,7 @@ dummy_func(void) {
ctx->frame->globals_watched = true;
}
if (ctx->frame->globals_checked_version != version && this_instr[-1].opcode == _NOP) {
- REPLACE_OP(&ctx->out_buffer[ctx->out_len - 1], _GUARD_GLOBALS_VERSION, 0, version);
+ REPLACE_OP(uop_buffer_last(&ctx->out_buffer), _GUARD_GLOBALS_VERSION, 0, version);
ctx->frame->globals_checked_version = version;
}
if (ctx->frame->globals_checked_version == version) {
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index f62e15b987c0eb..341805d51e24cd 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -1557,7 +1557,7 @@
ctx->frame->globals_watched = true;
}
if (ctx->frame->globals_checked_version != version && this_instr[-1].opcode == _NOP) {
- REPLACE_OP(&ctx->out_buffer[ctx->out_len - 1], _GUARD_GLOBALS_VERSION, 0, version);
+ REPLACE_OP(uop_buffer_last(&ctx->out_buffer), _GUARD_GLOBALS_VERSION, 0, version);
ctx->frame->globals_checked_version = version;
}
if (ctx->frame->globals_checked_version == version) {
@@ -2861,7 +2861,7 @@
if (sym_is_const(ctx, callable) && sym_matches_type(callable, &PyFunction_Type)) {
assert(PyFunction_Check(sym_get_const(ctx, callable)));
ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
- ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)sym_get_const(ctx, callable);
+ uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)sym_get_const(ctx, callable);
}
sym_set_type(callable, &PyFunction_Type);
break;
@@ -2879,7 +2879,7 @@
PyMethodObject *method = (PyMethodObject *)sym_get_const(ctx, callable);
assert(PyMethod_Check(method));
ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
- ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)method->im_func;
+ uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)method->im_func;
}
sym_set_type(callable, &PyMethod_Type);
break;
diff --git a/Python/pystate.c b/Python/pystate.c
index 89374e16722494..19f1245d60a2f8 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -24,7 +24,6 @@
#include "pycore_stackref.h" // Py_STACKREF_DEBUG
#include "pycore_stats.h" // FT_STAT_WORLD_STOP_INC()
#include "pycore_time.h" // _PyTime_Init()
-#include "pycore_uop.h" // UOP_BUFFER_SIZE
#include "pycore_uniqueid.h" // _PyObject_FinalizePerThreadRefcounts()