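Remove __init from ct_cpu_track_user() and drop its one-shot
"initialized" flag, together with the boot-only TIF_NOHZ propagation
and tasklist_empty() check that the flag guarded, so context tracking
can be activated on CPUs that join nohz_full at runtime. Replace
DEFINE_STATIC_KEY_FALSE_RO() with DEFINE_STATIC_KEY_FALSE() for
context_tracking_key, i.e. drop its __ro_after_init attribute, so the
key can still be changed after boot: static_branch_inc() when a CPU
joins nohz_full and static_branch_dec() when a CPU leaves it.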
Add ct_cpu_untrack_user() to reverse ct_cpu_track_user(), decrementing
the static key and clearing the per-CPU tracking state.
Register a housekeeping_cbs for HK_TYPE_KERNEL_NOISE that:
- pre_validate: checks that CONFIG_NO_HZ_FULL is available.
- apply: snapshots the new HK_TYPE_KERNEL_NOISE mask under an RCU
  read lock (the lockdep annotation in housekeeping_cpumask() requires
  this even after synchronize_rcu() completes), computes nohz_full as
  the complement of the housekeeping mask, then under tick_nohz_lock:
  - If nohz_full= was given at boot, activates context tracking
    (ct_cpu_track_user()) on CPUs newly added to nohz_full, and
    deactivates it (ct_cpu_untrack_user()) on CPUs returning to the
    housekeeping set. This activates the context_tracking_key static
    key dynamically, eliminating the need for
    CONFIG_CONTEXT_TRACKING_USER_FORCE.
  - Updates tick_nohz_full_mask in place (legacy EXPORT_SYMBOL_GPL
    snapshot, eventually consistent).
  - If nohz_full= was given at boot, migrates tick_do_timer_cpu if it
    moved into the isolated set.
  After dropping the lock, and again only if nohz_full= was given at
  boot, it kicks all CPUs to re-evaluate tick behaviour.
When CONFIG_CONTEXT_TRACKING_USER_FORCE is enabled and nohz_full= is
given at boot, tick_nohz_init() now calls context_tracking_init()
before iterating over tick_nohz_full_mask to call ct_cpu_track_user().
This ensures the per-CPU tracking state is set up before any CPU is
tracked, which is also required for CPUs later added to nohz_full at
runtime via DHM isolated partitions.
Signed-off-by: Jing Wu <[email protected]>
Signed-off-by: Qiliang Yuan <[email protected]>
---
include/linux/context_tracking.h | 1 +
kernel/context_tracking.c | 23 ++----
kernel/time/tick-sched.c | 157 +++++++++++++++++++++++++++++++++++++--
3 files changed, 161 insertions(+), 20 deletions(-)
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index af9fe87a09225..632cfc97b5b22 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -12,6 +12,7 @@
#ifdef CONFIG_CONTEXT_TRACKING_USER
extern void ct_cpu_track_user(int cpu);
+extern void ct_cpu_untrack_user(int cpu);
/* Called with interrupts disabled. */
extern void __ct_user_enter(enum ctx_state state);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index a743e7ffa6c00..e68fb02b25ad4 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -411,7 +411,7 @@ static __always_inline void ct_kernel_enter(bool user, int offset) { }
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key);
+DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
static noinstr bool context_tracking_recursion_enter(void)
@@ -674,28 +674,21 @@ void user_exit_callable(void)
}
NOKPROBE_SYMBOL(user_exit_callable);
-void __init ct_cpu_track_user(int cpu)
+/*
+ * Mark @cpu as tracked: set its per-CPU context_tracking.active flag and
+ * take one reference on context_tracking_key. Does nothing if @cpu is
+ * already active, so the key is incremented at most once per active CPU.
+ *
+ * NOTE(review): this version no longer sets TIF_NOHZ on init_task for
+ * CONFIG_HAVE_TIF_NOHZ and no longer warns when the task list is not
+ * empty - confirm that no architecture still relies on that propagation.
+ */
+void ct_cpu_track_user(int cpu)
{
- static __initdata bool initialized = false;
-
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
static_branch_inc(&context_tracking_key);
}
+}
- if (initialized)
+/*
+ * Reverse ct_cpu_track_user(): clear @cpu's per-CPU context_tracking.active
+ * flag and drop one reference on context_tracking_key. Does nothing if
+ * @cpu is not currently active.
+ *
+ * NOTE(review): static_branch_inc()/static_branch_dec() presumably may
+ * sleep when the key has to be patched, so both helpers should only be
+ * called from sleepable context - confirm against the callers.
+ */
+void ct_cpu_untrack_user(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu))
return;
-#ifdef CONFIG_HAVE_TIF_NOHZ
- /*
- * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
- * This assumes that init is the only task at this early boot stage.
- */
- set_tsk_thread_flag(&init_task, TIF_NOHZ);
-#endif
- WARN_ON_ONCE(!tasklist_empty());
-
- initialized = true;
+ per_cpu(context_tracking.active, cpu) = false;
+ static_branch_dec(&context_tracking_key);
}
#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cbbb87a0c6e7c..a7fe097042f7d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -26,6 +26,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
+#include <linux/sched/isolation.h>
+#include <linux/mutex.h>
#include <linux/mm.h>
#include <asm/irq_regs.h>
@@ -653,11 +654,6 @@ void __init tick_nohz_init(void)
if (!tick_nohz_full_running)
return;
- /*
- * Full dynticks uses IRQ work to drive the tick rescheduling on safe
- * locking contexts. But then we need IRQ work to raise its own
- * interrupts to avoid circular dependency on the tick.
- */
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
@@ -676,6 +672,16 @@ void __init tick_nohz_init(void)
}
}
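+ /*
+ * With CONFIG_CONTEXT_TRACKING_USER_FORCE, run context_tracking_init()
+ * before the boot-time nohz_full CPUs are tracked below, so that
+ * context tracking is already set up when a CPU is later added to
+ * nohz_full at runtime.
+ *
+ * NOTE(review): the tracking overhead is claimed to be negligible
+ * because the static key is not incremented yet and only per-CPU
+ * tracking state is set up. Confirm this against
+ * context_tracking_init(): if it calls ct_cpu_track_user() for every
+ * possible CPU, context_tracking_key is incremented for each of them.
+ */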
+ if (IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER_FORCE))
+ context_tracking_init();
+
for_each_cpu(cpu, tick_nohz_full_mask)
ct_cpu_track_user(cpu);
@@ -686,6 +692,147 @@ void __init tick_nohz_init(void)
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
}
+
+/*
+ * pre_validate callback for the tick/nohz housekeeping hooks.
+ *
+ * @type, @cur_mask and @new_mask are not inspected: the only condition
+ * checked is that the kernel was built with CONFIG_NO_HZ_FULL.
+ *
+ * Returns 0 when CONFIG_NO_HZ_FULL is enabled, -EOPNOTSUPP otherwise.
+ */
+static int tick_nohz_hk_validate(enum hk_type type,
+				 const struct cpumask *cur_mask,
+				 const struct cpumask *new_mask)
+{
+	return IS_ENABLED(CONFIG_NO_HZ_FULL) ? 0 : -EOPNOTSUPP;
+}
+
+/*
+ * tick_nohz_hk_apply - "apply" callback for the tick/nohz housekeeping hooks
+ * @type: housekeeping type being updated; not inspected, the function always
+ *        works on HK_TYPE_KERNEL_NOISE.
+ *
+ * Recomputes the nohz_full set as the complement of the
+ * HK_TYPE_KERNEL_NOISE housekeeping mask and propagates it:
+ *
+ *  - tick_nohz_full_mask is always updated (if it is allocated);
+ *  - context tracking, tick_nohz_full_running, the tick_do_timer_cpu
+ *    migration and the final kick are only touched when
+ *    tick_nohz_full_running was already set on entry.
+ *
+ * Must be called from sleepable context: it allocates with GFP_KERNEL,
+ * takes a mutex and may patch a static key. If a scratch cpumask cannot be
+ * allocated the update is silently skipped, because the callback has no way
+ * to report an error.
+ */
+static void tick_nohz_hk_apply(enum hk_type type)
+{
+	/*
+	 * This has to be a sleeping lock: ct_cpu_track_user() and
+	 * ct_cpu_untrack_user() call static_branch_inc()/static_branch_dec(),
+	 * which may sleep when the key has to be patched. Calling them under
+	 * a spinlock would be a sleep-in-atomic bug.
+	 */
+	static DEFINE_MUTEX(tick_nohz_lock);
+	cpumask_var_t nohz_full, added, removed;
+	bool was_running;
+	int cpu;
+
+	if (!alloc_cpumask_var(&nohz_full, GFP_KERNEL))
+		return;
+	if (!alloc_cpumask_var(&added, GFP_KERNEL))
+		goto out_free_nohz_full;
+	if (!alloc_cpumask_var(&removed, GFP_KERNEL))
+		goto out_free_added;
+
+	/*
+	 * Snapshot the new HK_TYPE_KERNEL_NOISE mask under an RCU read lock.
+	 * housekeeping_update_types() completes synchronize_rcu() before
+	 * invoking apply(), so the new pointer is stable; however the lockdep
+	 * annotation in housekeeping_cpumask() still requires an RCU read-side
+	 * critical section for runtime-mutable types.
+	 */
+	rcu_read_lock();
+	cpumask_andnot(nohz_full, cpu_possible_mask,
+		       housekeeping_cpumask_rcu(HK_TYPE_KERNEL_NOISE));
+	rcu_read_unlock();
+
+	/*
+	 * When "nohz_full=" was not passed at boot, tick_nohz_full_running is
+	 * false and the full dynticks infrastructure (sched_tick_offload_init,
+	 * RCU nohz quiescent-state reporting, context-tracking bootstrap) was
+	 * never initialised. In that case restrict the update to
+	 * tick_nohz_full_mask so the /sys/devices/system/cpu/nohz_full sysfs
+	 * attribute reflects DHM-isolated CPUs without enabling tick
+	 * suppression, context tracking, or timer migration - all of which
+	 * require boot-time setup and would deadlock on the first
+	 * synchronize_rcu() call after CPUs are offlined.
+	 *
+	 * NOTE(review): tick_nohz_full_running is cleared below once the
+	 * nohz_full set becomes empty, so every later call sees
+	 * was_running == false and takes the restricted path, even though
+	 * the boot-time setup was done. Confirm this one-way latch is
+	 * intended.
+	 */
+	was_running = READ_ONCE(tick_nohz_full_running);
+
+	mutex_lock(&tick_nohz_lock);
+
+	/*
+	 * When nohz_full= was active at boot, compute the delta against the
+	 * previous tick_nohz_full_mask and update context tracking for the
+	 * CPUs joining or leaving the nohz_full set. This is done before the
+	 * mask itself is overwritten below.
+	 */
+	if (IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER) &&
+	    was_running &&
+	    cpumask_available(tick_nohz_full_mask)) {
+		cpumask_andnot(added, nohz_full, tick_nohz_full_mask);
+		cpumask_andnot(removed, tick_nohz_full_mask, nohz_full);
+		for_each_cpu(cpu, added)
+			ct_cpu_track_user(cpu);
+		for_each_cpu(cpu, removed)
+			ct_cpu_untrack_user(cpu);
+	}
+
+	/*
+	 * Update tick_nohz_full_mask unconditionally: this is the snapshot
+	 * read by the /sys/devices/system/cpu/nohz_full sysfs attribute and
+	 * must reflect the current isolation set even in the DHM runtime case.
+	 */
+	if (cpumask_available(tick_nohz_full_mask))
+		cpumask_copy(tick_nohz_full_mask, nohz_full);
+
+	/*
+	 * Only modify tick_nohz_full_running and migrate the global tick when
+	 * nohz_full= was set at boot; without boot-time setup, setting
+	 * tick_nohz_full_running would suppress ticks on isolated CPUs and
+	 * prevent RCU quiescent-state reporting, causing synchronize_rcu()
+	 * to stall permanently when a CPU is subsequently offlined.
+	 */
+	if (was_running) {
+		tick_nohz_full_running = !cpumask_empty(nohz_full);
+
+		if (tick_nohz_full_running) {
+			/*
+			 * The mutex does not disable preemption, so take an
+			 * explicit RCU read-side section around the
+			 * housekeeping mask lookups, as for the snapshot
+			 * above.
+			 */
+			rcu_read_lock();
+			cpu = READ_ONCE(tick_do_timer_cpu);
+			if (cpu < nr_cpu_ids &&
+			    !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
+				int new_cpu;
+
+				/* The tick timer CPU got isolated: move it. */
+				new_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+				if (new_cpu < nr_cpu_ids)
+					WRITE_ONCE(tick_do_timer_cpu, new_cpu);
+			}
+			rcu_read_unlock();
+		}
+	}
+
+	mutex_unlock(&tick_nohz_lock);
+
+	/* Let every CPU re-evaluate its tick against the new mask. */
+	if (was_running)
+		tick_nohz_full_kick_all();
+
+	free_cpumask_var(removed);
+out_free_added:
+	free_cpumask_var(added);
+out_free_nohz_full:
+	free_cpumask_var(nohz_full);
+}
+
+/*
+ * Housekeeping callbacks for the tick/nohz code, registered for
+ * HK_TYPE_KERNEL_NOISE by tick_nohz_hk_init_late(): pre_validate vets a
+ * proposed mask change, apply propagates the new mask into the nohz_full
+ * state.
+ */
+static struct housekeeping_cbs tick_nohz_hk_cbs = {
+ .name = "tick/nohz",
+ .pre_validate = tick_nohz_hk_validate,
+ .apply = tick_nohz_hk_apply,
+};
+
+/*
+ * Late initcall: make sure tick_nohz_full_mask exists and register the
+ * tick/nohz housekeeping callbacks for HK_TYPE_KERNEL_NOISE.
+ *
+ * Failures are only logged. The function always returns 0, and the
+ * callbacks are still registered when the mask allocation fails
+ * (tick_nohz_hk_apply() checks cpumask_available() before touching it).
+ */
+static int __init tick_nohz_hk_init_late(void)
+{
+	int ret;
+
+	/*
+	 * Ensure tick_nohz_full_mask is allocated so that tick_nohz_hk_apply()
+	 * can update it (and the /sys/devices/system/cpu/nohz_full sysfs
+	 * attribute) when CPUs are isolated at runtime via DHM. If "nohz_full="
+	 * was passed at boot the mask is already allocated; allocate an empty
+	 * one here for the runtime-only case.
+	 */
+	if (!cpumask_available(tick_nohz_full_mask) &&
+	    !zalloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL))
+		pr_warn("tick/nohz: failed to allocate nohz_full_mask for DHM\n");
+
+	ret = housekeeping_register_cbs(HK_TYPE_KERNEL_NOISE, &tick_nohz_hk_cbs);
+	if (ret)
+		pr_warn("tick/nohz: Failed to register hk callback: %d\n", ret);
+	return 0;
+}
+late_initcall(tick_nohz_hk_init_late);
#endif /* #ifdef CONFIG_NO_HZ_FULL */
/*
--
2.43.0