https://github.com/python/cpython/commit/65f12370982b9982b204d07f9f26ca8740f21845
commit: 65f12370982b9982b204d07f9f26ca8740f21845
branch: main
author: Savannah Ostrowski <savannahostrow...@gmail.com>
committer: brandtbucher <brandtbuc...@gmail.com>
date: 2024-09-27T00:35:42Z
summary:

GH-123516: Improve JIT memory consumption by invalidating cold executors (GH-124443)

Co-authored-by: Bénédikt Tran <10796600+picn...@users.noreply.github.com>

files:
A Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst
M Include/internal/pycore_ceval.h
M Include/internal/pycore_interp.h
M Include/internal/pycore_optimizer.h
M Include/internal/pycore_uop_ids.h
M Include/internal/pycore_uop_metadata.h
M Python/bytecodes.c
M Python/ceval_gil.c
M Python/executor_cases.c.h
M Python/optimizer.c
M Python/optimizer_cases.c.h
M Python/pystate.c
M Tools/cases_generator/analyzer.py
M Tools/jit/_targets.py

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index a97b53028c8f59..363845106e40dc 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -283,6 +283,7 @@ PyAPI_FUNC(PyObject *) _PyEval_LoadName(PyThreadState *tstate, _PyInterpreterFra
 #define _PY_GC_SCHEDULED_BIT (1U << 4)
 #define _PY_EVAL_PLEASE_STOP_BIT (1U << 5)
 #define _PY_EVAL_EXPLICIT_MERGE_BIT (1U << 6)
+#define _PY_EVAL_JIT_INVALIDATE_COLD_BIT (1U << 7)
 
 /* Reserve a few bits for future use */
 #define _PY_EVAL_EVENTS_BITS 8
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 36366429e8db25..a1898d926ac39f 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -261,7 +261,7 @@ struct _is {
     struct callable_cache callable_cache;
     _PyOptimizerObject *optimizer;
     _PyExecutorObject *executor_list_head;
-
+    size_t trace_run_counter;
     _rare_events rare_events;
     PyDict_WatchCallback builtins_dict_watcher;
 
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index 19e54bf122a8bb..f92c0a0cddf906 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -29,9 +29,10 @@ typedef struct {
 typedef struct {
     uint8_t opcode;
     uint8_t oparg;
-    uint16_t valid:1;
-    uint16_t linked:1;
-    uint16_t chain_depth:14;  // Must be big engough for MAX_CHAIN_DEPTH - 1.
+    uint8_t valid:1;
+    uint8_t linked:1;
+    uint8_t chain_depth:6;  // Must be big enough for MAX_CHAIN_DEPTH - 1.
+    bool warm;
     int index;           // Index of ENTER_EXECUTOR (if code isn't NULL, 
below).
     _PyBloomFilter bloom;
     _PyExecutorLinkListNode links;
@@ -123,11 +124,18 @@ PyAPI_FUNC(PyObject *) _PyOptimizer_NewUOpOptimizer(void);
 #ifdef _Py_TIER2
 PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState 
*interp, void *obj, int is_invalidation);
 PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int 
is_invalidation);
+PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
+
 #else
 #  define _Py_Executors_InvalidateDependency(A, B, C) ((void)0)
 #  define _Py_Executors_InvalidateAll(A, B) ((void)0)
+#  define _Py_Executors_InvalidateCold(A) ((void)0)
+
 #endif
 
+// Used as the threshold to trigger executor invalidation when
+// trace_run_counter is greater than this value.
+#define JIT_CLEANUP_THRESHOLD 100000
 
 // This is the length of the trace we project initially.
 #define UOP_MAX_TRACE_LENGTH 800
diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h
index b950f760d74ac7..927dae88c1fa73 100644
--- a/Include/internal/pycore_uop_ids.h
+++ b/Include/internal/pycore_uop_ids.h
@@ -222,64 +222,65 @@ extern "C" {
 #define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD
 #define _MAKE_CELL MAKE_CELL
 #define _MAKE_FUNCTION MAKE_FUNCTION
+#define _MAKE_WARM 439
 #define _MAP_ADD MAP_ADD
 #define _MATCH_CLASS MATCH_CLASS
 #define _MATCH_KEYS MATCH_KEYS
 #define _MATCH_MAPPING MATCH_MAPPING
 #define _MATCH_SEQUENCE MATCH_SEQUENCE
-#define _MAYBE_EXPAND_METHOD 439
-#define _MONITOR_CALL 440
-#define _MONITOR_JUMP_BACKWARD 441
-#define _MONITOR_RESUME 442
+#define _MAYBE_EXPAND_METHOD 440
+#define _MONITOR_CALL 441
+#define _MONITOR_JUMP_BACKWARD 442
+#define _MONITOR_RESUME 443
 #define _NOP NOP
 #define _POP_EXCEPT POP_EXCEPT
-#define _POP_JUMP_IF_FALSE 443
-#define _POP_JUMP_IF_TRUE 444
+#define _POP_JUMP_IF_FALSE 444
+#define _POP_JUMP_IF_TRUE 445
 #define _POP_TOP POP_TOP
-#define _POP_TOP_LOAD_CONST_INLINE_BORROW 445
+#define _POP_TOP_LOAD_CONST_INLINE_BORROW 446
 #define _PUSH_EXC_INFO PUSH_EXC_INFO
-#define _PUSH_FRAME 446
+#define _PUSH_FRAME 447
 #define _PUSH_NULL PUSH_NULL
-#define _PY_FRAME_GENERAL 447
-#define _PY_FRAME_KW 448
-#define _QUICKEN_RESUME 449
-#define _REPLACE_WITH_TRUE 450
+#define _PY_FRAME_GENERAL 448
+#define _PY_FRAME_KW 449
+#define _QUICKEN_RESUME 450
+#define _REPLACE_WITH_TRUE 451
 #define _RESUME_CHECK RESUME_CHECK
 #define _RETURN_GENERATOR RETURN_GENERATOR
 #define _RETURN_VALUE RETURN_VALUE
-#define _SAVE_RETURN_OFFSET 451
-#define _SEND 452
-#define _SEND_GEN_FRAME 453
+#define _SAVE_RETURN_OFFSET 452
+#define _SEND 453
+#define _SEND_GEN_FRAME 454
 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS
 #define _SET_ADD SET_ADD
 #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE
 #define _SET_UPDATE SET_UPDATE
-#define _START_EXECUTOR 454
-#define _STORE_ATTR 455
-#define _STORE_ATTR_INSTANCE_VALUE 456
-#define _STORE_ATTR_SLOT 457
-#define _STORE_ATTR_WITH_HINT 458
+#define _START_EXECUTOR 455
+#define _STORE_ATTR 456
+#define _STORE_ATTR_INSTANCE_VALUE 457
+#define _STORE_ATTR_SLOT 458
+#define _STORE_ATTR_WITH_HINT 459
 #define _STORE_DEREF STORE_DEREF
-#define _STORE_FAST 459
-#define _STORE_FAST_0 460
-#define _STORE_FAST_1 461
-#define _STORE_FAST_2 462
-#define _STORE_FAST_3 463
-#define _STORE_FAST_4 464
-#define _STORE_FAST_5 465
-#define _STORE_FAST_6 466
-#define _STORE_FAST_7 467
+#define _STORE_FAST 460
+#define _STORE_FAST_0 461
+#define _STORE_FAST_1 462
+#define _STORE_FAST_2 463
+#define _STORE_FAST_3 464
+#define _STORE_FAST_4 465
+#define _STORE_FAST_5 466
+#define _STORE_FAST_6 467
+#define _STORE_FAST_7 468
 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST
 #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST
 #define _STORE_GLOBAL STORE_GLOBAL
 #define _STORE_NAME STORE_NAME
-#define _STORE_SLICE 468
-#define _STORE_SUBSCR 469
+#define _STORE_SLICE 469
+#define _STORE_SUBSCR 470
 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT
 #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT
 #define _SWAP SWAP
-#define _TIER2_RESUME_CHECK 470
-#define _TO_BOOL 471
+#define _TIER2_RESUME_CHECK 471
+#define _TO_BOOL 472
 #define _TO_BOOL_BOOL TO_BOOL_BOOL
 #define _TO_BOOL_INT TO_BOOL_INT
 #define _TO_BOOL_LIST TO_BOOL_LIST
@@ -289,14 +290,14 @@ extern "C" {
 #define _UNARY_NEGATIVE UNARY_NEGATIVE
 #define _UNARY_NOT UNARY_NOT
 #define _UNPACK_EX UNPACK_EX
-#define _UNPACK_SEQUENCE 472
+#define _UNPACK_SEQUENCE 473
 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST
 #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE
 #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE
 #define _WITH_EXCEPT_START WITH_EXCEPT_START
 #define _YIELD_VALUE YIELD_VALUE
 #define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX
-#define MAX_UOP_ID 472
+#define MAX_UOP_ID 473
 
 #ifdef __cplusplus
 }
diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h
index 4d0ab22e6aa8f3..07606135d7a356 100644
--- a/Include/internal/pycore_uop_metadata.h
+++ b/Include/internal/pycore_uop_metadata.h
@@ -274,6 +274,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = {
     [_INTERNAL_INCREMENT_OPT_COUNTER] = 0,
     [_DYNAMIC_EXIT] = HAS_ESCAPES_FLAG,
     [_START_EXECUTOR] = 0,
+    [_MAKE_WARM] = 0,
     [_FATAL_ERROR] = 0,
     [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG,
     [_DEOPT] = 0,
@@ -481,6 +482,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = {
     [_LOAD_SUPER_ATTR_METHOD] = "_LOAD_SUPER_ATTR_METHOD",
     [_MAKE_CELL] = "_MAKE_CELL",
     [_MAKE_FUNCTION] = "_MAKE_FUNCTION",
+    [_MAKE_WARM] = "_MAKE_WARM",
     [_MAP_ADD] = "_MAP_ADD",
     [_MATCH_CLASS] = "_MATCH_CLASS",
     [_MATCH_KEYS] = "_MATCH_KEYS",
@@ -1062,6 +1064,8 @@ int _PyUop_num_popped(int opcode, int oparg)
             return 0;
         case _START_EXECUTOR:
             return 0;
+        case _MAKE_WARM:
+            return 0;
         case _FATAL_ERROR:
             return 0;
         case _CHECK_VALIDITY_AND_SET_IP:
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst
new file mode 100644
index 00000000000000..de62875e16475d
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst
@@ -0,0 +1,2 @@
+Improved JIT memory consumption by periodically freeing memory used by infrequently-executed code.
+This change is especially likely to improve the memory footprint of long-running programs.
diff --git a/Python/bytecodes.c b/Python/bytecodes.c
index 0fd396f1319e78..8535306d9c7a03 100644
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@@ -4836,6 +4836,14 @@ dummy_func(
             assert(((_PyExecutorObject *)executor)->vm_data.valid);
         }
 
+        tier2 op(_MAKE_WARM, (--)) {
+            current_executor->vm_data.warm = true;  // Flag read by _Py_Executors_InvalidateCold.
+            // trace_run_counter is a size_t: past zero it wraps rather than going negative, which is okay.
+            if (--tstate->interp->trace_run_counter == 0) {
+                _Py_set_eval_breaker_bit(tstate, 
+_PY_EVAL_JIT_INVALIDATE_COLD_BIT);
+            }
+        }
+
         tier2 op(_FATAL_ERROR, (--)) {
             assert(0);
             Py_FatalError("Fatal error uop executed.");
diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c
index 6f4476d055b5ec..1d9381d09dfb62 100644
--- a/Python/ceval_gil.c
+++ b/Python/ceval_gil.c
@@ -1289,6 +1289,12 @@ _Py_HandlePending(PyThreadState *tstate)
         _Py_RunGC(tstate);
     }
 
+    if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) {
+        _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
+        _Py_Executors_InvalidateCold(tstate->interp);
+        tstate->interp->trace_run_counter = JIT_CLEANUP_THRESHOLD;
+    }
+
     /* GIL drop request */
     if ((breaker & _PY_GIL_DROP_REQUEST_BIT) != 0) {
         /* Give another thread a chance */
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
index 7a9c6ab89c38cc..650bf4533a3a86 100644
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@@ -5435,6 +5435,15 @@
             break;
         }
 
+        case _MAKE_WARM: {
+            current_executor->vm_data.warm = true;  // Flag read by _Py_Executors_InvalidateCold.
+            // trace_run_counter is a size_t: past zero it wraps rather than going negative, which is okay.
+            if (--tstate->interp->trace_run_counter == 0) {
+                _Py_set_eval_breaker_bit(tstate, 
+_PY_EVAL_JIT_INVALIDATE_COLD_BIT);
+            }
+            break;
+        }
+
         case _FATAL_ERROR: {
             assert(0);
             Py_FatalError("Fatal error uop executed.");
diff --git a/Python/optimizer.c b/Python/optimizer.c
index bb7a90b3204f40..978649faa04d45 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -565,6 +565,7 @@ translate_bytecode_to_trace(
             code->co_firstlineno,
             2 * INSTR_IP(initial_instr, code));
     ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code));
+    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);
     uint32_t target = 0;
 
     for (;;) {
@@ -1194,6 +1195,9 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
     executor->jit_code = NULL;
     executor->jit_side_entry = NULL;
     executor->jit_size = 0;
+    // This is initialized to true so we can prevent the executor
+    // from being immediately detected as cold and invalidated.
+    executor->vm_data.warm = true;
     if (_PyJIT_Compile(executor, executor->trace, length)) {
         Py_DECREF(executor);
         return NULL;
@@ -1659,4 +1663,42 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
     }
 }
 
+void
+_Py_Executors_InvalidateCold(PyInterpreterState *interp)
+{
+    /* Walk interp's executor list: clear executors whose warm flag is unset, reset the flag on the rest. */
+    /* TODO: Use a tree to avoid traversing as many objects. */
+    PyObject *invalidate = PyList_New(0);
+    if (invalidate == NULL) {
+        goto error;
+    }
+
+    /* Clearing an executor can deallocate others, so we need to make a list of
+     * executors to invalidate first */
+    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
+        assert(exec->vm_data.valid);
+        _PyExecutorObject *next = exec->vm_data.links.next;  // Read the link before touching exec.
+
+        if (!exec->vm_data.warm && PyList_Append(invalidate, (PyObject *)exec) 
+< 0) {
+            goto error;  // Only reached when a cold executor could not be appended.
+        }
+        else {
+            exec->vm_data.warm = false;  // Also runs for appended cold executors, where it is a no-op.
+        }
+
+        exec = next;
+    }
+    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
+        _PyExecutorObject *exec = (_PyExecutorObject 
+*)PyList_GET_ITEM(invalidate, i);
+        executor_clear(exec);
+    }
+    Py_DECREF(invalidate);
+    return;
+error:
+    PyErr_Clear();  // This function returns void, so the failure is not reported to the caller.
+    Py_XDECREF(invalidate);  // invalidate is NULL when PyList_New itself failed.
+    // If we're truly out of memory, wiping out everything is a fine fallback
+    _Py_Executors_InvalidateAll(interp, 0);
+}
+
 #endif /* _Py_TIER2 */
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index a6cfa271ae6758..4d172e3c762704 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -2381,6 +2381,10 @@
             break;
         }
 
+        case _MAKE_WARM: {
+            break;
+        }
+
         case _FATAL_ERROR: {
             break;
         }
diff --git a/Python/pystate.c b/Python/pystate.c
index 6bf7ebeb75ff73..6b85e5a64fefcf 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -660,6 +660,7 @@ init_interpreter(PyInterpreterState *interp,
 #ifdef _Py_TIER2
     (void)_Py_SetOptimizer(interp, NULL);
     interp->executor_list_head = NULL;
+    interp->trace_run_counter = JIT_CLEANUP_THRESHOLD;
 #endif
     if (interp != &runtime->_main_interpreter) {
         /* Fix the self-referential, statically initialized fields. */
diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py
index aabe205125856c..a4ce207703edcd 100644
--- a/Tools/cases_generator/analyzer.py
+++ b/Tools/cases_generator/analyzer.py
@@ -540,6 +540,7 @@ def has_error_without_pop(op: parser.InstDef) -> bool:
     "_PyList_FromStackRefSteal",
     "_PyTuple_FromArraySteal",
     "_PyTuple_FromStackRefSteal",
+    "_Py_set_eval_breaker_bit"
 )
 
 ESCAPING_FUNCTIONS = (
diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py
index e37ee943999785..6c7b48f1f37865 100644
--- a/Tools/jit/_targets.py
+++ b/Tools/jit/_targets.py
@@ -139,6 +139,9 @@ async def _compile(
             "-fno-plt",
             # Don't call stack-smashing canaries that we can't find or patch:
             "-fno-stack-protector",
+            # On aarch64 Linux, intrinsics were being emitted and this flag
+            # was required to disable them.
+            "-mno-outline-atomics",
             "-std=c11",
             *self.args,
         ]

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to