When CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled, tasks that block in RCU
read-side critical sections may be placed on per-CPU lists rather than
directly on the rcu_node's blkd_tasks list. A task can therefore block just
after rcu_gp_init()'s promotion scan completes, leaving it only on the
per-CPU list while a GP is active.
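
For illustration, the window looks roughly like this (a sketch of the
sequence described above, not a captured trace):

  GP kthread                            reader task on CPU x
  ----------                            --------------------
  rcu_gp_init():
    promotion scan moves tasks from
    per-CPU blocked lists onto
    rnp->blkd_tasks; scan completes
                                        rcu_read_lock()
                                        preempted; queued only on
                                        CPU x's per-CPU blocked list
  grace period continues                task never appears on
                                        rnp->blkd_tasks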
The RCU priority boosting mechanism only looks at rnp->gp_tasks and
rnp->exp_tasks, which point into rnp->blkd_tasks. Tasks on per-CPU lists
are therefore invisible to the boost kthread and cannot be boosted.

Address this by adding a "promote" parameter to
rcu_preempt_blocked_readers_cgp(). When promote is true (the caller must
hold the rcu_node's ->lock), the function first promotes any tasks from
the per-CPU blocked lists to the rcu_node's blkd_tasks list before
checking whether there are blocked readers. This ensures that
late-arriving tasks are visible for priority boosting and other
operations.

Callers that hold the rnp lock pass promote=true to get an accurate
answer that includes late arrivals. Lockless callers (the GP loop and the
FQS check) pass promote=false for an approximate snapshot. (TODO: check
whether we can always pass promote=true and remove the parameter.)

Signed-off-by: Joel Fernandes <[email protected]>
---
 kernel/rcu/tree.c        | 14 +++++++-------
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 34 ++++++++++++++++++++++++++++------
 kernel/rcu/tree_stall.h  |  4 ++--
 4 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5837e9923642..f8f43f94adbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2034,7 +2034,7 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
 		return true;
 
 	// The current grace period has completed.
-	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp, false))
 		return true;
 
 	return false;
@@ -2125,7 +2125,7 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
 		 * the corresponding leaf nodes have passed through their quiescent state.
 		 */
 		if (!READ_ONCE(rnp->qsmask) &&
-		    !rcu_preempt_blocked_readers_cgp(rnp))
+		    !rcu_preempt_blocked_readers_cgp(rnp, false))
 			break;
 		/* If time for quiescent-state forcing, do it. */
 		if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
@@ -2207,7 +2207,7 @@ static noinline void rcu_gp_cleanup(void)
 	rcu_seq_end(&new_gp_seq);
 	rcu_for_each_node_breadth_first(rnp) {
 		raw_spin_lock_irq_rcu_node(rnp);
-		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
 			dump_blkd_tasks(rnp, 10);
 		WARN_ON_ONCE(rnp->qsmask);
 		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
@@ -2376,13 +2376,13 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 		}
 		WARN_ON_ONCE(oldmask);	/* Any child must be all zeroed! */
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
-			     rcu_preempt_blocked_readers_cgp(rnp));
+			     rcu_preempt_blocked_readers_cgp(rnp, true));
 		WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
 		trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
 						 mask, rnp->qsmask, rnp->level,
 						 rnp->grplo, rnp->grphi,
 						 !!rnp->gp_tasks);
-		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp, true)) {
 
 			/* Other bits still set at this level, so done. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2428,7 +2428,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 
 	raw_lockdep_assert_held_rcu_node(rnp);
 	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
-	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
+	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)) ||
 	    rnp->qsmask != 0) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;  /* Still need more quiescent states! */
@@ -2763,7 +2763,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		rcu_state.cbovldnext |= !!rnp->cbovldmask;
 		if (rnp->qsmask == 0) {
-			if (rcu_preempt_blocked_readers_cgp(rnp)) {
+			if (rcu_preempt_blocked_readers_cgp(rnp, true)) {
 				/*
 				 * No point in scanning bits because they
 				 * are all zero.  But we might need to
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b71c6c1de8d3..25eb9200e6ef 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -486,7 +486,7 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
 /* Forward declarations for tree_plugin.h */
 static void rcu_bootup_announce(void);
 static void rcu_qs(void);
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote);
 #ifdef CONFIG_HOTPLUG_CPU
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad33fdd0efe8..6ed3815bb912 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -383,9 +383,28 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Check for preempted RCU readers blocking the current grace period
  * for the specified rcu_node structure. If the caller needs a reliable
  * answer, it must hold the rcu_node's ->lock.
+ *
+ * If @promote is true and CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled,
+ * this function first promotes any tasks from per-CPU blocked lists to
+ * the rcu_node's blkd_tasks list before checking. This ensures that
+ * late-arriving tasks (blocked after GP init's promotion scan) are
+ * visible for priority boosting and other operations. When promoting,
+ * the caller must hold rnp->lock.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	if (promote && rcu_is_leaf_node(rnp)) {
+		int cpu;
+		struct rcu_data *rdp;
+
+		raw_lockdep_assert_held_rcu_node(rnp);
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+			rdp = per_cpu_ptr(&rcu_data, cpu);
+			rcu_promote_blocked_tasks_rdp(rdp, rnp);
+		}
+	}
+#endif
 	return READ_ONCE(rnp->gp_tasks) != NULL;
 }
 
@@ -570,7 +589,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
-		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
+		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp, true);
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
 			     (!empty_norm || rnp->qsmask));
 		empty_exp = sync_rcu_exp_done(rnp);
@@ -597,7 +616,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 * so we must take a snapshot of the expedited state.
 		 */
 		empty_exp_now = sync_rcu_exp_done(rnp);
-		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
+		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp, true)) {
 			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
 							 rnp->gp_seq,
 							 0, rnp->qsmask,
@@ -901,7 +920,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 	RCU_LOCKDEP_WARN(preemptible(),
			 "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
 	raw_lockdep_assert_held_rcu_node(rnp);
-	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
 		dump_blkd_tasks(rnp, 10);
 	if (rcu_preempt_has_tasks(rnp) &&
 	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
@@ -1127,7 +1146,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Because preemptible RCU does not exist, there are never any preempted
  * RCU readers.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
 	return 0;
 }
@@ -1221,6 +1240,9 @@ static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
 
 static void rcu_promote_blocked_tasks(struct rcu_node *rnp) { }
 
+static void rcu_promote_blocked_tasks_rdp(struct rcu_data *rdp,
+					  struct rcu_node *rnp) { }
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /*
@@ -1378,7 +1400,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
 	if (!rnp->boost_kthread_task ||
-	    (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
+	    (!rcu_preempt_blocked_readers_cgp(rnp, true) && !rnp->exp_tasks)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b67532cb8770..5aa65130ab5c 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -277,7 +277,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 	struct task_struct *t;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
@@ -331,7 +331,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
 	struct task_struct *ts[8];
 
 	lockdep_assert_irqs_disabled();
-	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return 0;
 	}
-- 
2.34.1

