Integrate with the scheduler to migrate a task's active hazard pointers from their per-CPU slots to the task's backup slot on context switch. This ensures that per-CPU slots are not held for a long time by blocked or preempted tasks still holding hazard pointers.
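
For reference, a read-side critical section is expected to look roughly like the sketch below. This is illustrative only: "struct foo", "foo_ptr" and "use_foo()" are made-up names; the API is the hazptr_acquire()/hazptr_release() pair declared in hazptr.h. With CONFIG_PREEMPT_HAZPTR=y nothing changes for the reader: if it blocks or is preempted between acquire and release, the scheduler moves its hazard pointer from the per-CPU slot to the context's backup slot.

	struct foo *p;
	struct hazptr_ctx ctx;

	/* Protect the object published through foo_ptr. */
	p = hazptr_acquire(&ctx, (void * const *)&foo_ptr);
	if (p) {
		/*
		 * p cannot be reclaimed until hazptr_release().
		 * Being preempted here no longer pins a per-CPU slot:
		 * the hazard pointer is migrated to ctx's backup slot.
		 */
		use_foo(p);
		hazptr_release(&ctx, p);
	}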
Signed-off-by: Mathieu Desnoyers <[email protected]>
Cc: Nicholas Piggin <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Sebastian Andrzej Siewior <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: Alan Stern <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Neeraj Upadhyay <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Joel Fernandes <[email protected]>
Cc: Josh Triplett <[email protected]>
Cc: Uladzislau Rezki <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Lai Jiangshan <[email protected]>
Cc: Zqiang <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: [email protected]
Cc: Mateusz Guzik <[email protected]>
Cc: Jonas Oberhauser <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
 include/linux/hazptr.h | 63 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/sched.h  |  4 +++
 init/init_task.c       |  3 ++
 kernel/Kconfig.preempt | 10 +++++++
 kernel/fork.c          |  3 ++
 kernel/sched/core.c    |  2 ++
 6 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/include/linux/hazptr.h b/include/linux/hazptr.h
index 70c066ddb0f5..10ac53a42a7a 100644
--- a/include/linux/hazptr.h
+++ b/include/linux/hazptr.h
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/types.h>
 #include <linux/cleanup.h>
+#include <linux/sched.h>
 
 /* 8 slots (each sizeof(void *)) fit in a single cache line. */
 #define NR_HAZPTR_PERCPU_SLOTS 8
@@ -46,6 +47,9 @@ struct hazptr_ctx {
 	struct hazptr_slot *slot;
 	/* Backup slot in case all per-CPU slots are used. */
 	struct hazptr_backup_slot backup_slot;
+#ifdef CONFIG_PREEMPT_HAZPTR
+	struct list_head preempt_node;
+#endif
 };
 
 struct hazptr_percpu_slots {
@@ -98,6 +102,50 @@ bool hazptr_slot_is_backup(struct hazptr_ctx *ctx, struct hazptr_slot *slot)
 	return slot == &ctx->backup_slot.slot;
 }
 
+#ifdef CONFIG_PREEMPT_HAZPTR
+static inline
+void hazptr_chain_task_ctx(struct hazptr_ctx *ctx)
+{
+	list_add(&ctx->preempt_node, &current->hazptr_ctx_list);
+}
+
+static inline
+void hazptr_unchain_task_ctx(struct hazptr_ctx *ctx)
+{
+	list_del(&ctx->preempt_node);
+}
+
+static inline
+void hazptr_note_context_switch(void)
+{
+	struct hazptr_ctx *ctx;
+
+	list_for_each_entry(ctx, &current->hazptr_ctx_list, preempt_node) {
+		struct hazptr_slot *slot;
+
+		if (hazptr_slot_is_backup(ctx, ctx->slot))
+			continue;
+		slot = hazptr_chain_backup_slot(ctx);
+		/*
+		 * Move hazard pointer from per-CPU slot to backup slot.
+		 * This requires hazard pointer synchronize to iterate
+		 * on per-CPU slots with load-acquire before iterating
+		 * on the overflow list.
+		 */
+		WRITE_ONCE(slot->addr, ctx->slot->addr);
+		/*
+		 * store-release orders store to backup slot addr before
+		 * store to per-CPU slot addr.
+		 */
+		smp_store_release(&ctx->slot->addr, NULL);
+	}
+}
+#else
+static inline void hazptr_chain_task_ctx(struct hazptr_ctx *ctx) { }
+static inline void hazptr_unchain_task_ctx(struct hazptr_ctx *ctx) { }
+static inline void hazptr_note_context_switch(void) { }
+#endif
+
 /*
  * hazptr_acquire: Load pointer at address and protect with hazard pointer.
 *
@@ -114,6 +162,7 @@ void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
 	struct hazptr_slot *slot = NULL;
 	void *addr, *addr2;
 
+	ctx->slot = NULL;
 	/*
 	 * Load @addr_p to know which address should be protected.
 	 */
@@ -121,7 +170,9 @@ void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
 	for (;;) {
 		if (!addr)
 			return NULL;
+
 		guard(preempt)();
+		hazptr_chain_task_ctx(ctx);
 		if (likely(!hazptr_slot_is_backup(ctx, slot))) {
 			slot = hazptr_get_free_percpu_slot();
 			/*
@@ -140,8 +191,11 @@ void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
 		 * Re-load @addr_p after storing it to the hazard pointer slot.
 		 */
 		addr2 = READ_ONCE(*addr_p);	/* Load A */
-		if (likely(ptr_eq(addr2, addr)))
+		if (likely(ptr_eq(addr2, addr))) {
+			ctx->slot = slot;
+			/* Success. Break loop, enable preemption and return. */
 			break;
+		}
 		/*
 		 * If @addr_p content has changed since the first load,
 		 * release the hazard pointer and try again.
@@ -150,11 +204,14 @@ void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
 		if (!addr2) {
 			if (hazptr_slot_is_backup(ctx, slot))
 				hazptr_unchain_backup_slot(ctx);
+			hazptr_unchain_task_ctx(ctx);
+			/* Loaded NULL. Enable preemption and return NULL. */
 			return NULL;
 		}
 		addr = addr2;
+		hazptr_unchain_task_ctx(ctx);
+		/* Enable preemption and retry. */
 	}
-	ctx->slot = slot;
 	/*
 	 * Use addr2 loaded from the second READ_ONCE() to preserve
 	 * address dependency ordering.
@@ -170,11 +227,13 @@ void hazptr_release(struct hazptr_ctx *ctx, void *addr)
 	if (!addr)
 		return;
 
+	guard(preempt)();
 	slot = ctx->slot;
 	WARN_ON_ONCE(slot->addr != addr);
 	smp_store_release(&slot->addr, NULL);
 	if (unlikely(hazptr_slot_is_backup(ctx, slot)))
 		hazptr_unchain_backup_slot(ctx);
+	hazptr_unchain_task_ctx(ctx);
 }
 
 void hazptr_init(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b469878de25c..bbec9fd6b163 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -933,6 +933,10 @@ struct task_struct {
 	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
+#ifdef CONFIG_PREEMPT_HAZPTR
+	struct list_head hazptr_ctx_list;
+#endif
+
 #ifdef CONFIG_TASKS_RCU
 	unsigned long rcu_tasks_nvcsw;
 	u8 rcu_tasks_holdout;
diff --git a/init/init_task.c b/init/init_task.c
index a55e2189206f..117aebf5573a 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -160,6 +160,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
 	.rcu_blocked_node = NULL,
 #endif
+#ifdef CONFIG_PREEMPT_HAZPTR
+	.hazptr_ctx_list = LIST_HEAD_INIT(init_task.hazptr_ctx_list),
+#endif
 #ifdef CONFIG_TASKS_RCU
 	.rcu_tasks_holdout = false,
 	.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index da326800c1c9..beb351b42b7c 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -189,3 +189,13 @@ config SCHED_CLASS_EXT
	  For more information:
	  Documentation/scheduler/sched-ext.rst
	  https://github.com/sched-ext/scx
+
+config PREEMPT_HAZPTR
+	bool "Move Hazard Pointers to Task Slots on Context Switch"
+	help
+	  Integrate hazard pointers with the scheduler so that active
+	  hazard pointers using preallocated per-CPU slots are moved to
+	  their context-local backup slot on context switch. This
+	  prevents blocked or preempted tasks from holding on to per-CPU
+	  slots for a long time, which would cause higher overhead for
+	  short hazard pointer critical sections.
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..35c810fe744e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1780,6 +1780,9 @@ static inline void rcu_copy_process(struct task_struct *p)
 	p->rcu_blocked_node = NULL;
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_PREEMPT_HAZPTR
+	INIT_LIST_HEAD(&p->hazptr_ctx_list);
+#endif /* #ifdef CONFIG_PREEMPT_HAZPTR */
 #ifdef CONFIG_TASKS_RCU
 	p->rcu_tasks_holdout = false;
 	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..ac8bf2708140 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -60,6 +60,7 @@
 #include <linux/profile.h>
 #include <linux/psi.h>
 #include <linux/rcuwait_api.h>
+#include <linux/hazptr.h>
 #include <linux/rseq.h>
 #include <linux/sched/wake_q.h>
 #include <linux/scs.h>
@@ -6812,6 +6813,7 @@ static void __sched notrace __schedule(int sched_mode)
 
 	local_irq_disable();
 	rcu_note_context_switch(preempt);
+	hazptr_note_context_switch();
 
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
-- 
2.39.5
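
(Illustration, not part of the patch.) The comment added in hazptr_note_context_switch() spells out the contract with the reclaim side: a hazard pointer scan has to read the per-CPU slots with load-acquire before walking the backup/overflow slots, so that a pointer which is concurrently migrated (backup slot written with WRITE_ONCE(), then per-CPU slot cleared with smp_store_release()) is observed in at least one of the two passes. A minimal sketch of such a two-pass check follows; hazptr_addr_is_protected(), the hazptr_slots per-CPU variable, the slots[] array and scan_backup_slots() are hypothetical names used only for illustration:

	/* Hypothetical scan-side sketch, not taken from this series. */
	static bool hazptr_addr_is_protected(void *addr)
	{
		int cpu, i;

		/*
		 * Pass 1: per-CPU slots.  The acquire load pairs with the
		 * smp_store_release() that clears a per-CPU slot once its
		 * content has been copied to the backup slot.
		 */
		for_each_possible_cpu(cpu) {
			struct hazptr_percpu_slots *s = per_cpu_ptr(&hazptr_slots, cpu);

			for (i = 0; i < NR_HAZPTR_PERCPU_SLOTS; i++)
				if (smp_load_acquire(&s->slots[i].addr) == addr)
					return true;
		}

		/*
		 * Pass 2: backup/overflow slots.  A scan that observed the
		 * cleared per-CPU slot in pass 1 is, thanks to the
		 * acquire/release pairing, guaranteed to also observe the
		 * migrated address here.
		 */
		return scan_backup_slots(addr);	/* hypothetical helper */
	}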

