https://github.com/python/cpython/commit/d77aaa73116aa469cc6b7a0f8a68f3f30fd41962
commit: d77aaa73116aa469cc6b7a0f8a68f3f30fd41962
branch: main
author: Mark Shannon <[email protected]>
committer: markshannon <[email protected]>
date: 2026-01-22T10:55:49Z
summary:

GH-139109: Partial reworking of JIT data structures (GH-144105)

* Halve size of buffers by reusing combined trace + optimizer buffers for TOS caching
* Add simple buffer struct for more maintainable handling of buffers
* Decouple JIT structs from thread state struct
* Ensure terminator is added to trace when optimizer gives up
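
As a rough sketch of the resulting layout (UOP_MAX_TRACE_LENGTH and
uop_array are taken from the diff below; the toy instruction type and
main() are illustrative only), one combined backing array replaces the two
old fixed-size per-thread buffers, so the effective trace capacity stays
the same while the memory footprint halves:

    #include <assert.h>
    #include <stdio.h>

    #define UOP_MAX_TRACE_LENGTH 2500   /* release-build value in this commit */

    typedef struct { int opcode; } inst_t;  /* stand-in for _PyUOpInstruction */

    typedef struct {
        /* One array instead of code_buffer[5000] + out_buffer[5000]. */
        inst_t uop_array[2 * UOP_MAX_TRACE_LENGTH];
    } tracer_t;

    int main(void)
    {
        static tracer_t tracer;
        /* Translation builds the trace in the first half... */
        inst_t *code_buffer = &tracer.uop_array[0];
        /* ...the optimizer writes its output into the second half... */
        inst_t *opt_output = &tracer.uop_array[UOP_MAX_TRACE_LENGTH];
        /* ...and stack allocation (TOS caching) writes back into the first
           half, trailing its read cursor, so no third buffer is needed. */
        assert(code_buffer + UOP_MAX_TRACE_LENGTH == opt_output);
        printf("combined buffer: %d slots\n", 2 * UOP_MAX_TRACE_LENGTH);
        return 0;
    }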

files:
M Include/internal/pycore_optimizer.h
M Include/internal/pycore_optimizer_types.h
M Include/internal/pycore_tstate.h
M Include/internal/pycore_uop.h
M Python/ceval_macros.h
M Python/optimizer.c
M Python/optimizer_analysis.c
M Python/optimizer_bytecodes.c
M Python/optimizer_cases.c.h
M Python/pystate.c

diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index fbe403b492d5ac..2ee518fb82f301 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -16,12 +16,102 @@ extern "C" {
 #include <stdbool.h>
 
 
+typedef struct _PyJitUopBuffer {
+    _PyUOpInstruction *start;
+    _PyUOpInstruction *next;
+    _PyUOpInstruction *end;
+} _PyJitUopBuffer;
+
+
+typedef struct _JitOptContext {
+    char done;
+    char out_of_space;
+    bool contradiction;
+    // Has the builtins dict been watched?
+    bool builtins_watched;
+    // The current "executing" frame.
+    _Py_UOpsAbstractFrame *frame;
+    _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
+    int curr_frame_depth;
+
+    // Arena for the symbolic types.
+    ty_arena t_arena;
+
+    JitOptRef *n_consumed;
+    JitOptRef *limit;
+    JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
+    _PyJitUopBuffer out_buffer;
+} JitOptContext;
+
+
+static inline void
+uop_buffer_init(_PyJitUopBuffer *trace, _PyUOpInstruction *start, uint32_t size)
+{
+    trace->next = trace->start = start;
+    trace->end = start + size;
+}
+
+static inline _PyUOpInstruction *
+uop_buffer_last(_PyJitUopBuffer *trace)
+{
+    assert(trace->next > trace->start);
+    return trace->next-1;
+}
+
+static inline int
+uop_buffer_length(_PyJitUopBuffer *trace)
+{
+    return (int)(trace->next - trace->start);
+}
+
+static inline int
+uop_buffer_remaining_space(_PyJitUopBuffer *trace)
+{
+    return (int)(trace->end - trace->next);
+}
+
+typedef struct _PyJitTracerInitialState {
+    int stack_depth;
+    int chain_depth;
+    struct _PyExitData *exit;
+    PyCodeObject *code; // Strong
+    PyFunctionObject *func; // Strong
+    struct _PyExecutorObject *executor; // Strong
+    _Py_CODEUNIT *start_instr;
+    _Py_CODEUNIT *close_loop_instr;
+    _Py_CODEUNIT *jump_backward_instr;
+} _PyJitTracerInitialState;
+
+typedef struct _PyJitTracerPreviousState {
+    bool dependencies_still_valid;
+    int instr_oparg;
+    int instr_stacklevel;
+    _Py_CODEUNIT *instr;
+    PyCodeObject *instr_code; // Strong
+    struct _PyInterpreterFrame *instr_frame;
+    _PyBloomFilter dependencies;
+} _PyJitTracerPreviousState;
+
+typedef struct _PyJitTracerTranslatorState {
+    int jump_backward_seen;
+} _PyJitTracerTranslatorState;
+
+typedef struct _PyJitTracerState {
+    bool is_tracing;
+    _PyJitTracerInitialState initial_state;
+    _PyJitTracerPreviousState prev_state;
+    _PyJitTracerTranslatorState translator_state;
+    JitOptContext opt_context;
+    _PyJitUopBuffer code_buffer;
+    _PyJitUopBuffer out_buffer;
+    _PyUOpInstruction uop_array[2 * UOP_MAX_TRACE_LENGTH];
+} _PyJitTracerState;
+
 typedef struct _PyExecutorLinkListNode {
     struct _PyExecutorObject *next;
     struct _PyExecutorObject *previous;
 } _PyExecutorLinkListNode;
 
-
 typedef struct {
     uint8_t opcode;
     uint8_t oparg;
@@ -86,8 +176,8 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
 
 int _Py_uop_analyze_and_optimize(
     _PyThreadStateImpl *tstate,
-    _PyUOpInstruction *trace, int trace_len, int curr_stackentries,
-    _PyBloomFilter *dependencies);
+    _PyUOpInstruction *input, int trace_len, int curr_stackentries,
+    _PyUOpInstruction *output, _PyBloomFilter *dependencies);
 
 extern PyTypeObject _PyUOpExecutor_Type;
 
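A self-contained usage sketch of the buffer helpers above (the struct and
inline functions are copied from this hunk; the backing array and append
loop are illustrative): callers check the remaining space once up front and
then append by bumping the cursor, which is the pattern that replaces the
old RESERVE_RAW macro in Python/optimizer.c below.

    #include <assert.h>
    #include <stdint.h>

    typedef struct { uint16_t opcode; } _PyUOpInstruction;  /* stand-in */

    typedef struct _PyJitUopBuffer {
        _PyUOpInstruction *start;
        _PyUOpInstruction *next;
        _PyUOpInstruction *end;
    } _PyJitUopBuffer;

    static inline void
    uop_buffer_init(_PyJitUopBuffer *trace, _PyUOpInstruction *start, uint32_t size)
    {
        trace->next = trace->start = start;
        trace->end = start + size;
    }

    static inline int
    uop_buffer_length(_PyJitUopBuffer *trace)
    {
        return (int)(trace->next - trace->start);
    }

    static inline int
    uop_buffer_remaining_space(_PyJitUopBuffer *trace)
    {
        return (int)(trace->end - trace->next);
    }

    int main(void)
    {
        _PyUOpInstruction backing[8];
        _PyJitUopBuffer buf;
        uop_buffer_init(&buf, backing, 8);

        int needed = 3;                /* e.g. expansion->nuops plus guards */
        if (uop_buffer_remaining_space(&buf) >= needed) {
            for (int i = 0; i < needed; i++) {
                buf.next->opcode = 0;  /* fill in a uop */
                buf.next++;            /* append = bump the cursor */
            }
        }
        assert(uop_buffer_length(&buf) == 3);
        return 0;
    }
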
diff --git a/Include/internal/pycore_optimizer_types.h b/Include/internal/pycore_optimizer_types.h
index 7e0dbddce2d6b8..a879ca26ce7b63 100644
--- a/Include/internal/pycore_optimizer_types.h
+++ b/Include/internal/pycore_optimizer_types.h
@@ -126,27 +126,6 @@ typedef struct ty_arena {
     JitOptSymbol arena[TY_ARENA_SIZE];
 } ty_arena;
 
-typedef struct _JitOptContext {
-    char done;
-    char out_of_space;
-    bool contradiction;
-    // Has the builtins dict been watched?
-    bool builtins_watched;
-    // The current "executing" frame.
-    _Py_UOpsAbstractFrame *frame;
-    _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
-    int curr_frame_depth;
-
-    // Arena for the symbolic types.
-    ty_arena t_arena;
-
-    JitOptRef *n_consumed;
-    JitOptRef *limit;
-    JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
-    _PyUOpInstruction *out_buffer;
-    int out_len;
-} JitOptContext;
-
 
 #ifdef __cplusplus
 }
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 24a40416c2191b..64b90710b8e664 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -12,7 +12,6 @@ extern "C" {
 #include "pycore_freelist_state.h"  // struct _Py_freelists
 #include "pycore_interpframe_structs.h"  // _PyInterpreterFrame
 #include "pycore_mimalloc.h"        // struct _mimalloc_thread_state
-#include "pycore_optimizer_types.h" // JitOptContext
 #include "pycore_qsbr.h"            // struct qsbr
 #include "pycore_uop.h"             // struct _PyUOpInstruction
 #include "pycore_structs.h"
@@ -24,46 +23,6 @@ struct _gc_thread_state {
 };
 #endif
 
-#if _Py_TIER2
-typedef struct _PyJitTracerInitialState {
-    int stack_depth;
-    int chain_depth;
-    struct _PyExitData *exit;
-    PyCodeObject *code; // Strong
-    PyFunctionObject *func; // Strong
-    struct _PyExecutorObject *executor; // Strong
-    _Py_CODEUNIT *start_instr;
-    _Py_CODEUNIT *close_loop_instr;
-    _Py_CODEUNIT *jump_backward_instr;
-} _PyJitTracerInitialState;
-
-typedef struct _PyJitTracerPreviousState {
-    bool dependencies_still_valid;
-    int code_max_size;
-    int code_curr_size;
-    int instr_oparg;
-    int instr_stacklevel;
-    _Py_CODEUNIT *instr;
-    PyCodeObject *instr_code; // Strong
-    struct _PyInterpreterFrame *instr_frame;
-    _PyBloomFilter dependencies;
-} _PyJitTracerPreviousState;
-
-typedef struct _PyJitTracerTranslatorState {
-    int jump_backward_seen;
-} _PyJitTracerTranslatorState;
-
-typedef struct _PyJitTracerState {
-    bool is_tracing;
-    _PyJitTracerInitialState initial_state;
-    _PyJitTracerPreviousState prev_state;
-    _PyJitTracerTranslatorState translator_state;
-    JitOptContext opt_context;
-    _PyUOpInstruction code_buffer[UOP_MAX_TRACE_LENGTH];
-    _PyUOpInstruction out_buffer[UOP_MAX_TRACE_LENGTH];
-} _PyJitTracerState;
-
-#endif
 
 // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
 // PyThreadState fields are exposed as part of the C API, although most fields
@@ -141,7 +100,7 @@ typedef struct _PyThreadStateImpl {
     Py_ssize_t reftotal;  // this thread's total refcount operations
 #endif
 #if _Py_TIER2
-    _PyJitTracerState *jit_tracer_state;
+    struct _PyJitTracerState *jit_tracer_state;
 #endif
 } _PyThreadStateImpl;
 
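With the struct definitions moved out, pycore_tstate.h only needs a
forward declaration for the pointer field. A minimal sketch of that
decoupling pattern (names other than _PyJitTracerState are illustrative):

    /* In the thread-state header: no JIT includes required. */
    struct _PyJitTracerState;               /* forward declaration suffices */

    typedef struct {
        /* ... other per-thread fields ... */
        struct _PyJitTracerState *jit_tracer_state;  /* opaque pointer */
    } ThreadStateImpl;

    /* Only translation units that dereference the pointer include
       pycore_optimizer.h, where the full struct definition now lives. */
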
diff --git a/Include/internal/pycore_uop.h b/Include/internal/pycore_uop.h
index e828a1cc5a5722..f9be01acb57197 100644
--- a/Include/internal/pycore_uop.h
+++ b/Include/internal/pycore_uop.h
@@ -38,11 +38,10 @@ typedef struct _PyUOpInstruction{
 // This is the length of the trace we translate initially.
 #ifdef Py_DEBUG
     // With asserts, the stencils are a lot larger
-#define UOP_MAX_TRACE_LENGTH 2000
+#define UOP_MAX_TRACE_LENGTH 1000
 #else
-#define UOP_MAX_TRACE_LENGTH 5000
+#define UOP_MAX_TRACE_LENGTH 2500
 #endif
-#define UOP_BUFFER_SIZE (UOP_MAX_TRACE_LENGTH * sizeof(_PyUOpInstruction))
 
 /* Bloom filter with m = 256
  * https://en.wikipedia.org/wiki/Bloom_filter */
diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 3b4b3253b3638c..d791ba0e8eca97 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -433,7 +433,7 @@ do {                                                   \
         JUMP_TO_LABEL(error);                          \
     }                                                  \
     if (keep_tracing_bit) { \
-        assert(((_PyThreadStateImpl *)tstate)->jit_tracer_state->prev_state.code_curr_size == 2); \
+        assert(uop_buffer_length(&((_PyThreadStateImpl *)tstate)->jit_tracer_state->code_buffer)); \
         ENTER_TRACING(); \
         DISPATCH_NON_TRACING(); \
     } \
diff --git a/Python/optimizer.c b/Python/optimizer.c
index 15a1eb5a17745b..f25242972efeb1 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -188,9 +188,6 @@ _PyOptimizer_Optimize(
         }
         insert_executor(code, start, index, executor);
     }
-    else {
-        executor->vm_data.code = NULL;
-    }
     executor->vm_data.chain_depth = chain_depth;
     assert(executor->vm_data.valid);
     _PyExitData *exit = _tstate->jit_tracer_state->initial_state.exit;
@@ -547,52 +544,43 @@ guard_ip_uop[MAX_UOP_ID + 1] = {
 #endif
 
 
-static inline int
+static inline void
 add_to_trace(
-    _PyUOpInstruction *trace,
-    int trace_length,
+    _PyJitUopBuffer *trace,
     uint16_t opcode,
     uint16_t oparg,
     uint64_t operand,
     uint32_t target)
 {
-    trace[trace_length].opcode = opcode;
-    trace[trace_length].format = UOP_FORMAT_TARGET;
-    trace[trace_length].target = target;
-    trace[trace_length].oparg = oparg;
-    trace[trace_length].operand0 = operand;
+    _PyUOpInstruction *inst = trace->next;
+    inst->opcode = opcode;
+    inst->format = UOP_FORMAT_TARGET;
+    inst->target = target;
+    inst->oparg = oparg;
+    inst->operand0 = operand;
 #ifdef Py_STATS
-    trace[trace_length].execution_count = 0;
+    inst->execution_count = 0;
 #endif
-    return trace_length + 1;
+    trace->next++;
 }
 
+
 #ifdef Py_DEBUG
 #define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
-    assert(trace_length < max_length); \
-    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
+    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
     if (lltrace >= 2) { \
-        printf("%4d ADD_TO_TRACE: ", trace_length); \
-        _PyUOpPrint(&trace[trace_length-1]); \
+        printf("%4d ADD_TO_TRACE: ", uop_buffer_length(trace)); \
+        _PyUOpPrint(uop_buffer_last(trace)); \
         printf("\n"); \
     }
 #else
 #define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
-    assert(trace_length < max_length); \
-    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET));
+    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET))
 #endif
 
 #define INSTR_IP(INSTR, CODE) \
     ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
 
-// Reserve space for n uops
-#define RESERVE_RAW(n, opname) \
-    if (trace_length + (n) > max_length) { \
-        DPRINTF(2, "No room for %s (need %d, got %d)\n", \
-                (opname), (n), max_length - trace_length); \
-        OPT_STAT_INC(trace_too_long); \
-        goto full; \
-    }
 
 static int
 is_terminator(const _PyUOpInstruction *uop)
@@ -629,9 +617,7 @@ _PyJit_translate_single_bytecode_to_trace(
     PyCodeObject *old_code = tracer->prev_state.instr_code;
     bool progress_needed = (tracer->initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0;
     _PyBloomFilter *dependencies = &tracer->prev_state.dependencies;
-    int trace_length = tracer->prev_state.code_curr_size;
-    _PyUOpInstruction *trace = tracer->code_buffer;
-    int max_length = tracer->prev_state.code_max_size;
+    _PyJitUopBuffer *trace = &tracer->code_buffer;
 
     _Py_CODEUNIT *this_instr =  tracer->prev_state.instr;
     _Py_CODEUNIT *target_instr = this_instr;
@@ -670,15 +656,13 @@ _PyJit_translate_single_bytecode_to_trace(
         }
     }
 
-    int old_stack_level = tracer->prev_state.instr_stacklevel;
-
     // Strange control-flow
     bool has_dynamic_jump_taken = OPCODE_HAS_UNPREDICTABLE_JUMP(opcode) &&
         (next_instr != this_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]);
 
     /* Special case the first instruction,
     * so that we can guarantee forward progress */
-    if (progress_needed && tracer->prev_state.code_curr_size < CODE_SIZE_NO_PROGRESS) {
+    if (progress_needed && uop_buffer_length(&tracer->code_buffer) < CODE_SIZE_NO_PROGRESS) {
         if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
             opcode = _PyOpcode_Deopt[opcode];
         }
@@ -694,7 +678,7 @@ _PyJit_translate_single_bytecode_to_trace(
 
     int is_sys_tracing = (tstate->c_tracefunc != NULL) || (tstate->c_profilefunc != NULL);
     if (is_sys_tracing) {
-        goto full;
+        goto done;
     }
 
     if (stop_tracing_opcode == _DEOPT) {
@@ -710,7 +694,7 @@ _PyJit_translate_single_bytecode_to_trace(
         goto done;
     }
 
-    DPRINTF(2, "%p %d: %s(%d) %d %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip, old_stack_level);
+    DPRINTF(2, "%p %d: %s(%d) %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip);
 
 #ifdef Py_DEBUG
     if (oparg > 255) {
@@ -719,7 +703,7 @@ _PyJit_translate_single_bytecode_to_trace(
 #endif
 
     if (!tracer->prev_state.dependencies_still_valid) {
-        goto full;
+        goto done;
     }
 
     // This happens when there is a recursive call that we can't trace, such as Python -> C -> Python calls
@@ -734,16 +718,14 @@ _PyJit_translate_single_bytecode_to_trace(
         unsupported:
         {
             // Rewind to previous instruction and replace with _EXIT_TRACE.
-            _PyUOpInstruction *curr = &trace[trace_length-1];
-            while (curr->opcode != _SET_IP && trace_length > 2) {
-                trace_length--;
-                curr = &trace[trace_length-1];
+            _PyUOpInstruction *curr = uop_buffer_last(trace);
+            while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
+                trace->next--;
+                curr = uop_buffer_last(trace);
             }
-            assert(curr->opcode == _SET_IP || trace_length == 2);
+            assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
             if (curr->opcode == _SET_IP) {
                 int32_t old_target = (int32_t)uop_get_target(curr);
-                curr++;
-                trace_length++;
                 curr->opcode = _DEOPT;
                 curr->format = UOP_FORMAT_TARGET;
                 curr->target = old_target;
@@ -752,7 +734,6 @@ _PyJit_translate_single_bytecode_to_trace(
         }
     }
 
-
     if (opcode == NOP) {
         return 1;
     }
@@ -766,7 +747,7 @@ _PyJit_translate_single_bytecode_to_trace(
     }
 
     // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
-    max_length -= 2;
+    trace->end -= 2;
 
     const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
 
@@ -775,18 +756,28 @@ _PyJit_translate_single_bytecode_to_trace(
 
 
     if (OPCODE_HAS_EXIT(opcode)) {
-        // Make space for side exit and final _EXIT_TRACE:
-        max_length--;
+        // Make space for side exit
+        trace->end--;
     }
     if (OPCODE_HAS_ERROR(opcode)) {
-        // Make space for error stub and final _EXIT_TRACE:
-        max_length--;
+        // Make space for error stub
+        trace->end--;
+    }
+    if (OPCODE_HAS_DEOPT(opcode)) {
+        // Make space for side exit
+        trace->end--;
     }
 
     // _GUARD_IP leads to an exit.
-    max_length -= needs_guard_ip;
+    trace->end -= needs_guard_ip;
 
-    RESERVE_RAW(expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode)), "uop and various checks");
+    int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
+    if (uop_buffer_remaining_space(trace) < space_needed) {
+        DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
+                space_needed, uop_buffer_remaining_space(trace));
+        OPT_STAT_INC(trace_too_long);
+        goto done;
+    }
 
     ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
 
@@ -825,7 +816,7 @@ _PyJit_translate_single_bytecode_to_trace(
         {
             if ((next_instr != tracer->initial_state.close_loop_instr) &&
                 (next_instr != tracer->initial_state.start_instr) &&
-                tracer->prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS &&
+                uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
                 // For side exits, we don't want to terminate them early.
                 tracer->initial_state.exit == NULL &&
                 // These are coroutines, and we want to unroll those usually.
@@ -836,7 +827,7 @@ _PyJit_translate_single_bytecode_to_trace(
                 // inner loop might start and let the traces rejoin.
                 OPT_STAT_INC(inner_loop);
                 ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
-                trace[trace_length-1].operand1 = true; // is_control_flow
+                uop_buffer_last(trace)->operand1 = true; // is_control_flow
                 DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", 
next_instr,
                     tracer->initial_state.close_loop_instr, 
tracer->initial_state.start_instr);
                 goto done;
@@ -913,19 +904,19 @@ _PyJit_translate_single_bytecode_to_trace(
                         }
                         break;
                     case OPERAND1_1:
-                        assert(trace[trace_length-1].opcode == uop);
+                        assert(uop_buffer_last(trace)->opcode == uop);
                         operand = read_u16(&this_instr[offset].cache);
-                        trace[trace_length-1].operand1 = operand;
+                        uop_buffer_last(trace)->operand1 = operand;
                         continue;
                     case OPERAND1_2:
-                        assert(trace[trace_length-1].opcode == uop);
+                        assert(uop_buffer_last(trace)->opcode == uop);
                         operand = read_u32(&this_instr[offset].cache);
-                        trace[trace_length-1].operand1 = operand;
+                        uop_buffer_last(trace)->operand1 = operand;
                         continue;
                     case OPERAND1_4:
-                        assert(trace[trace_length-1].opcode == uop);
+                        assert(uop_buffer_last(trace)->opcode == uop);
                         operand = read_u64(&this_instr[offset].cache);
-                        trace[trace_length-1].operand1 = operand;
+                        uop_buffer_last(trace)->operand1 = operand;
                         continue;
                     default:
                         fprintf(stderr,
@@ -955,7 +946,7 @@ _PyJit_translate_single_bytecode_to_trace(
                         }
                     }
                     ADD_TO_TRACE(uop, oparg, operand, target);
-                    trace[trace_length - 1].operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
+                    uop_buffer_last(trace)->operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
                     break;
                 }
                 if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
@@ -973,9 +964,9 @@ _PyJit_translate_single_bytecode_to_trace(
     }  // End switch (opcode)
 
     if (needs_guard_ip) {
-        uint16_t guard_ip = guard_ip_uop[trace[trace_length-1].opcode];
+        uint16_t guard_ip = guard_ip_uop[uop_buffer_last(trace)->opcode];
         if (guard_ip == 0) {
-            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[trace[trace_length-1].opcode]);
+            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[uop_buffer_last(trace)->opcode]);
             Py_UNREACHABLE();
         }
         ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0);
@@ -983,7 +974,7 @@ _PyJit_translate_single_bytecode_to_trace(
     // Loop back to the start
     int is_first_instr = tracer->initial_state.close_loop_instr == next_instr ||
         tracer->initial_state.start_instr == next_instr;
-    if (is_first_instr && tracer->prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS) {
+    if (is_first_instr && uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
         if (needs_guard_ip) {
             ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)next_instr, 0);
         }
@@ -991,27 +982,13 @@ _PyJit_translate_single_bytecode_to_trace(
         goto done;
     }
     DPRINTF(2, "Trace continuing\n");
-    tracer->prev_state.code_curr_size = trace_length;
-    tracer->prev_state.code_max_size = max_length;
     return 1;
 done:
     DPRINTF(2, "Trace done\n");
-    tracer->prev_state.code_curr_size = trace_length;
-    tracer->prev_state.code_max_size = max_length;
-    return 0;
-full:
-    DPRINTF(2, "Trace full\n");
-    if (!is_terminator(&tracer->code_buffer[trace_length-1])) {
-        // Undo the last few instructions.
-        trace_length = tracer->prev_state.code_curr_size;
-        max_length = tracer->prev_state.code_max_size;
-        // We previously reversed one.
-        max_length += 1;
+    if (!is_terminator(uop_buffer_last(trace))) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
-        trace[trace_length-1].operand1 = true; // is_control_flow
+        uop_buffer_last(trace)->operand1 = true; // is_control_flow
     }
-    tracer->prev_state.code_curr_size = trace_length;
-    tracer->prev_state.code_max_size = max_length;
     return 0;
 }
 
@@ -1059,11 +1036,12 @@ _PyJit_TryInitializeTracing(
         2 * INSTR_IP(close_loop_instr, code),
         chain_depth);
 #endif
-    add_to_trace(tracer->code_buffer, 0, _START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
-    add_to_trace(tracer->code_buffer, 1, _MAKE_WARM, 0, 0, 0);
-    tracer->prev_state.code_curr_size = CODE_SIZE_EMPTY;
+    /* Set up tracing buffer */
+    _PyJitUopBuffer *trace = &tracer->code_buffer;
+    uop_buffer_init(trace, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
+    ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
+    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);
 
-    tracer->prev_state.code_max_size = UOP_MAX_TRACE_LENGTH/2;
     tracer->initial_state.start_instr = start_instr;
     tracer->initial_state.close_loop_instr = close_loop_instr;
     tracer->initial_state.code = (PyCodeObject *)Py_NewRef(code);
@@ -1122,8 +1100,7 @@ _PyJit_FinalizeTracing(PyThreadState *tstate, int err)
     Py_CLEAR(tracer->initial_state.func);
     Py_CLEAR(tracer->initial_state.executor);
     Py_CLEAR(tracer->prev_state.instr_code);
-    tracer->prev_state.code_curr_size = CODE_SIZE_EMPTY;
-    tracer->prev_state.code_max_size = UOP_MAX_TRACE_LENGTH/2 - 1;
+    uop_buffer_init(&tracer->code_buffer, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
     tracer->is_tracing = false;
 }
 
@@ -1137,7 +1114,6 @@ _PyJit_TracerFree(_PyThreadStateImpl *_tstate)
 }
 
 #undef RESERVE
-#undef RESERVE_RAW
 #undef INSTR_IP
 #undef ADD_TO_TRACE
 #undef DPRINTF
@@ -1467,39 +1443,47 @@ int effective_trace_length(_PyUOpInstruction *buffer, int length)
 
 
 static int
-stack_allocate(_PyUOpInstruction *buffer, int length)
+stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length)
 {
     assert(buffer[0].opcode == _START_EXECUTOR);
-    for (int i = length-1; i >= 0; i--) {
-        buffer[i*2+1] = buffer[i];
-        buffer[i*2].format = UOP_FORMAT_TARGET;
-        buffer[i*2].oparg = 0;
-        buffer[i*2].target = 0;
+    /* The input and output buffers will overlap.
+       Make sure that we can move instructions to the output
+       without overwriting the input. */
+    if (buffer == output) {
+        // This can only happen if optimizer has not been run
+        for (int i = 0; i < length; i++) {
+            buffer[i + UOP_MAX_TRACE_LENGTH] = buffer[i];
+        }
+        buffer += UOP_MAX_TRACE_LENGTH;
+    }
+    else {
+        assert(output + UOP_MAX_TRACE_LENGTH == buffer);
     }
     int depth = 0;
+    _PyUOpInstruction *write = output;
     for (int i = 0; i < length; i++) {
-        _PyUOpInstruction *spill_or_reload = &buffer[i*2];
-        int uop = buffer[i*2+1].opcode;
+        int uop = buffer[i].opcode;
         if (uop == _NOP) {
-            // leave _NOPs to be cleaned up later
-            spill_or_reload->opcode = _NOP;
             continue;
         }
         int new_depth = _PyUop_Caching[uop].best[depth];
-        if (new_depth == depth) {
-            spill_or_reload->opcode = _NOP;
-        }
-        else {
-            spill_or_reload->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
+        if (new_depth != depth) {
+            write->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
+            assert(write->opcode != 0);
+            write->format = UOP_FORMAT_TARGET;
+            write->oparg = 0;
+            write->target = 0;
+            write++;
             depth = new_depth;
         }
+        *write = buffer[i];
         uint16_t new_opcode = _PyUop_Caching[uop].entries[depth].opcode;
         assert(new_opcode != 0);
-        assert(spill_or_reload->opcode != 0);
-        buffer[i*2+1].opcode = new_opcode;
+        write->opcode = new_opcode;
+        write++;
         depth = _PyUop_Caching[uop].entries[depth].output;
     }
-    return length*2;
+    return write - output;
 }
 
 static int
@@ -1512,28 +1496,28 @@ uop_optimize(
     _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
     assert(_tstate->jit_tracer_state != NULL);
     _PyBloomFilter *dependencies = &_tstate->jit_tracer_state->prev_state.dependencies;
-    _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer;
+    _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start;
     OPT_STAT_INC(attempts);
     bool is_noopt = !tstate->interp->opt_config.uops_optimize_enabled;
     int curr_stackentries = _tstate->jit_tracer_state->initial_state.stack_depth;
-    int length = _tstate->jit_tracer_state->prev_state.code_curr_size;
+    int length = uop_buffer_length(&_tstate->jit_tracer_state->code_buffer);
     if (length <= CODE_SIZE_NO_PROGRESS) {
         return 0;
     }
     assert(length > 0);
-    assert(length < UOP_MAX_TRACE_LENGTH/2);
+    assert(length < UOP_MAX_TRACE_LENGTH);
     OPT_STAT_INC(traces_created);
     if (!is_noopt) {
+        _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH];
         length = _Py_uop_analyze_and_optimize(
-            _tstate,
-            buffer, length,
-            curr_stackentries, dependencies);
+            _tstate, buffer, length, curr_stackentries,
+            output, dependencies);
         if (length <= 0) {
             return length;
         }
-        buffer = _tstate->jit_tracer_state->out_buffer;
+        buffer = output;
     }
-    assert(length < UOP_MAX_TRACE_LENGTH/2);
+    assert(length < UOP_MAX_TRACE_LENGTH);
     assert(length >= 1);
     /* Fix up */
     for (int pc = 0; pc < length; pc++) {
@@ -1549,7 +1533,9 @@ uop_optimize(
         assert(_PyOpcode_uop_name[buffer[pc].opcode]);
     }
     OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
-    length = stack_allocate(buffer, length);
+    _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[0];
+    length = stack_allocate(buffer, output, length);
+    buffer = output;
     length = prepare_for_execution(buffer, length);
     assert(length <= UOP_MAX_TRACE_LENGTH);
     _PyExecutorObject *executor = make_executor_from_uops(
@@ -1707,6 +1693,7 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s
 {
     executor->vm_data.valid = true;
     executor->vm_data.pending_deletion = 0;
+    executor->vm_data.code = NULL;
     for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
         executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
     }
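
The safety of stack_allocate() writing into the same uop_array it reads
from rests on the invariant sketched in its comment: the writer starts
UOP_MAX_TRACE_LENGTH slots below the reader and emits at most two outputs
(a spill or reload plus the rewritten uop) per input, so after i inputs the
write cursor is at most output + 2*i while the read cursor is at
output + UOP_MAX_TRACE_LENGTH + i, and the writer can never overtake the
reader; when the optimizer was skipped and the two pointers coincide, the
trace is first copied up into the second half to restore this layout. A toy
demonstration of the invariant (CAP and the data are made up):

    #include <assert.h>
    #include <stdio.h>

    #define CAP 8                     /* stands in for UOP_MAX_TRACE_LENGTH */

    int main(void)
    {
        int arr[2 * CAP] = {0};
        int *output = &arr[0];        /* first half: final, rewritten trace */
        int *buffer = &arr[CAP];      /* second half: optimizer output */
        int length = 5;
        for (int i = 0; i < length; i++) {
            buffer[i] = i + 1;        /* pretend these are uops */
        }
        int *write = output;
        for (int i = 0; i < length; i++) {
            assert(write <= buffer + i);  /* writer trails the reader */
            if (buffer[i] % 2 == 0) {
                *write++ = -buffer[i];    /* a "spill/reload" goes first */
            }
            *write++ = buffer[i];         /* then the uop itself */
        }
        printf("expanded %d uops into %d\n", length, (int)(write - output));
        return 0;
    }
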
diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index e4e259a81b510f..c6a1ae60a317fa 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -203,14 +203,14 @@ static inline void
 add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr,
        uint16_t opcode, uint16_t oparg, uintptr_t operand0)
 {
-    _PyUOpInstruction *out = &ctx->out_buffer[ctx->out_len];
+    _PyUOpInstruction *out = ctx->out_buffer.next;
     out->opcode = (opcode);
     out->format = this_instr->format;
     out->oparg = (oparg);
     out->target = this_instr->target;
     out->operand0 = (operand0);
     out->operand1 = this_instr->operand1;
-    ctx->out_len++;
+    ctx->out_buffer.next++;
 }
 
 /* Shortened forms for convenience, used in optimizer_bytecodes.c */
@@ -430,6 +430,7 @@ optimize_uops(
     _PyUOpInstruction *trace,
     int trace_len,
     int curr_stacklen,
+    _PyUOpInstruction *output,
     _PyBloomFilter *dependencies
 )
 {
@@ -440,7 +441,7 @@ optimize_uops(
     JitOptContext *ctx = &tstate->jit_tracer_state->opt_context;
     uint32_t opcode = UINT16_MAX;
 
-    ctx->out_buffer = tstate->jit_tracer_state->out_buffer;
+    uop_buffer_init(&ctx->out_buffer, output, UOP_MAX_TRACE_LENGTH);
 
     // Make sure that watchers are set up
     PyInterpreterState *interp = _PyInterpreterState_GET();
@@ -458,14 +459,20 @@ optimize_uops(
     ctx->curr_frame_depth++;
     ctx->frame = frame;
 
-    ctx->out_len = 0;
-
     _PyUOpInstruction *this_instr = NULL;
     JitOptRef *stack_pointer = ctx->frame->stack_pointer;
 
-    for (int i = 0; !ctx->done; i++) {
-        assert(i < trace_len);
+    for (int i = 0; i < trace_len; i++) {
         this_instr = &trace[i];
+        if (ctx->done) {
+            // Don't do any more optimization, but
+            // we still need to reach a terminator for correctness.
+            *(ctx->out_buffer.next++) = *this_instr;
+            if (is_terminator_uop(this_instr)) {
+                break;
+            }
+            continue;
+        }
 
         int oparg = this_instr->oparg;
         opcode = this_instr->opcode;
@@ -485,6 +492,8 @@ optimize_uops(
         }
 #endif
 
+        _PyUOpInstruction *out_ptr = ctx->out_buffer.next;
+
         switch (opcode) {
 
 #include "optimizer_cases.c.h"
@@ -494,8 +503,8 @@ optimize_uops(
                 Py_UNREACHABLE();
         }
         // If no ADD_OP was called during this iteration, copy the original instruction
-        if (ctx->out_len == i) {
-            ctx->out_buffer[ctx->out_len++] = *this_instr;
+        if (ctx->out_buffer.next == out_ptr) {
+            *(ctx->out_buffer.next++) = *this_instr;
         }
         assert(ctx->frame != NULL);
         if (!CURRENT_FRAME_IS_INIT_SHIM()) {
@@ -526,20 +535,11 @@ optimize_uops(
      * would be no benefit in retrying later */
     _Py_uop_abstractcontext_fini(ctx);
     // Check that the trace ends with a proper terminator
-    if (ctx->out_len > 0) {
-        _PyUOpInstruction *last_uop = &ctx->out_buffer[ctx->out_len - 1];
-        if (!is_terminator_uop(last_uop)) {
-            // Copy remaining uops from original trace until we find a terminator
-            for (int i = ctx->out_len; i < trace_len; i++) {
-                ctx->out_buffer[ctx->out_len++] = trace[i];
-                if (is_terminator_uop(&trace[i])) {
-                    break;
-                }
-            }
-        }
+    if (uop_buffer_length(&ctx->out_buffer) > 0) {
+        assert(is_terminator_uop(uop_buffer_last(&ctx->out_buffer)));
     }
 
-    return ctx->out_len;
+    return uop_buffer_length(&ctx->out_buffer);
 
 error:
     DPRINTF(3, "\n");
@@ -696,14 +696,15 @@ _Py_uop_analyze_and_optimize(
     _PyUOpInstruction *buffer,
     int length,
     int curr_stacklen,
+    _PyUOpInstruction *output,
     _PyBloomFilter *dependencies
 )
 {
     OPT_STAT_INC(optimizer_attempts);
 
     length = optimize_uops(
-         tstate, buffer,
-         length, curr_stacklen, dependencies);
+         tstate, buffer, length, curr_stacklen,
+         output, dependencies);
 
     if (length == 0) {
         return length;
@@ -711,7 +712,7 @@ _Py_uop_analyze_and_optimize(
 
     assert(length > 0);
 
-    length = remove_unneeded_uops(tstate->jit_tracer_state->out_buffer, length);
+    length = remove_unneeded_uops(output, length);
     assert(length > 0);
 
     OPT_STAT_INC(optimizer_successes);
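
The terminator guarantee from the commit summary is enforced inside the
main loop above: once ctx->done is set, the remaining input uops are copied
through verbatim until a terminator closes the trace, and the old fix-up
pass after the loop reduces to an assert. A condensed sketch of that
control flow (the types, opcodes, and the "done" trigger are toy stand-ins):

    #include <assert.h>
    #include <stdbool.h>

    typedef struct { int opcode; } uop_t;  /* stand-in for _PyUOpInstruction */
    enum { OP_ADD, OP_EXIT_TRACE };        /* toy opcodes */

    static bool is_terminator(const uop_t *u) { return u->opcode == OP_EXIT_TRACE; }

    static int
    optimize(const uop_t *in, int len, uop_t *out)
    {
        uop_t *next = out;
        bool done = false;
        for (int i = 0; i < len; i++) {
            if (done) {
                *next++ = in[i];           /* verbatim pass-through */
                if (is_terminator(&in[i])) {
                    break;                 /* output now ends correctly */
                }
                continue;
            }
            *next++ = in[i];               /* the real loop may rewrite or drop */
            if (i == 1) {
                done = true;               /* e.g. optimizer gave up */
            }
        }
        return (int)(next - out);
    }

    int main(void)
    {
        uop_t in[] = {{OP_ADD}, {OP_ADD}, {OP_ADD}, {OP_EXIT_TRACE}};
        uop_t out[4];
        int n = optimize(in, 4, out);
        assert(n == 4 && is_terminator(&out[n - 1]));
        return 0;
    }
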
diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c
index 0ccc788dff962d..1584e731d1b2d4 100644
--- a/Python/optimizer_bytecodes.c
+++ b/Python/optimizer_bytecodes.c
@@ -194,7 +194,6 @@ dummy_func(void) {
                     _Py_BloomFilter_Add(dependencies, type);
                 }
             }
-
         }
     }
 
@@ -798,7 +797,7 @@ dummy_func(void) {
         if (sym_is_const(ctx, callable) && sym_matches_type(callable, &PyFunction_Type)) {
             assert(PyFunction_Check(sym_get_const(ctx, callable)));
             ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-            ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)sym_get_const(ctx, callable);
+            uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)sym_get_const(ctx, callable);
         }
         sym_set_type(callable, &PyFunction_Type);
     }
@@ -808,7 +807,7 @@ dummy_func(void) {
             PyMethodObject *method = (PyMethodObject *)sym_get_const(ctx, callable);
             assert(PyMethod_Check(method));
             ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-            ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)method->im_func;
+            uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)method->im_func;
         }
         sym_set_type(callable, &PyMethod_Type);
     }
@@ -1570,7 +1569,7 @@ dummy_func(void) {
                     ctx->frame->globals_watched = true;
                 }
                 if (ctx->frame->globals_checked_version != version && this_instr[-1].opcode == _NOP) {
-                    REPLACE_OP(&ctx->out_buffer[ctx->out_len - 1], _GUARD_GLOBALS_VERSION, 0, version);
+                    REPLACE_OP(uop_buffer_last(&ctx->out_buffer), _GUARD_GLOBALS_VERSION, 0, version);
                     ctx->frame->globals_checked_version = version;
                 }
                 if (ctx->frame->globals_checked_version == version) {
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index f62e15b987c0eb..341805d51e24cd 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -1557,7 +1557,7 @@
                         ctx->frame->globals_watched = true;
                     }
                     if (ctx->frame->globals_checked_version != version && this_instr[-1].opcode == _NOP) {
-                        REPLACE_OP(&ctx->out_buffer[ctx->out_len - 1], _GUARD_GLOBALS_VERSION, 0, version);
+                        REPLACE_OP(uop_buffer_last(&ctx->out_buffer), _GUARD_GLOBALS_VERSION, 0, version);
                         ctx->frame->globals_checked_version = version;
                     }
                     if (ctx->frame->globals_checked_version == version) {
@@ -2861,7 +2861,7 @@
             if (sym_is_const(ctx, callable) && sym_matches_type(callable, &PyFunction_Type)) {
                 assert(PyFunction_Check(sym_get_const(ctx, callable)));
                 ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-                ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)sym_get_const(ctx, callable);
+                uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)sym_get_const(ctx, callable);
             }
             sym_set_type(callable, &PyFunction_Type);
             break;
@@ -2879,7 +2879,7 @@
                 PyMethodObject *method = (PyMethodObject *)sym_get_const(ctx, callable);
                 assert(PyMethod_Check(method));
                 ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-                ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)method->im_func;
+                uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)method->im_func;
             }
             sym_set_type(callable, &PyMethod_Type);
             break;
diff --git a/Python/pystate.c b/Python/pystate.c
index 89374e16722494..19f1245d60a2f8 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -24,7 +24,6 @@
 #include "pycore_stackref.h"      // Py_STACKREF_DEBUG
 #include "pycore_stats.h"         // FT_STAT_WORLD_STOP_INC()
 #include "pycore_time.h"          // _PyTime_Init()
-#include "pycore_uop.h"           // UOP_BUFFER_SIZE
 #include "pycore_uniqueid.h"      // _PyObject_FinalizePerThreadRefcounts()
 
 
