When CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled, tasks that block in
RCU read-side critical sections may be placed on per-CPU lists rather
than directly on the rcu_node's blkd_tasks list.  It is possible that a
task can block just after rcu_gp_init()'s promotion scan completes,
leaving it only on the per-CPU list while a GP is active.

The RCU priority boosting mechanism only looks at rnp->gp_tasks and
rnp->exp_tasks, which point into rnp->blkd_tasks.  Tasks on per-CPU
lists are invisible to the boost kthread and cannot be boosted.

Address this by adding a "promote" parameter to
rcu_preempt_blocked_readers_cgp().  When promote is true and the caller
holds the rcu_node's ->lock, the function first promotes any tasks from
the per-CPU blocked lists to the rcu_node's blkd_tasks list before
checking whether there are blocked readers.
This ensures that late-arriving tasks are visible for priority boosting
and other operations.

Callers that hold the rnp lock pass promote=true to get an accurate answer
including late arrivals. Lockless callers (GP loop, FQS check) pass
promote=false for an approximate snapshot (TODO: need to check if we can
always just set "promote" to true and remove the parameter).
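
As a usage sketch drawn from the hunks below, the two calling
conventions look like this:

	/* Locked caller (e.g. rcu_report_qs_rnp()): exact answer, late arrivals promoted. */
	raw_lockdep_assert_held_rcu_node(rnp);
	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp, true)) {
		/* Blocked readers or unreported CPUs remain at this level. */
	}

	/* Lockless caller (e.g. rcu_gp_fqs_check_wake()): approximate snapshot. */
	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp, false))
		return true;	/* The current grace period has completed. */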

Signed-off-by: Joel Fernandes <[email protected]>
---
 kernel/rcu/tree.c        | 14 +++++++-------
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 34 ++++++++++++++++++++++++++++------
 kernel/rcu/tree_stall.h  |  4 ++--
 4 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5837e9923642..f8f43f94adbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2034,7 +2034,7 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
                return true;
 
        // The current grace period has completed.
-       if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+       if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp, false))
                return true;
 
        return false;
@@ -2125,7 +2125,7 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
                 * the corresponding leaf nodes have passed through their quiescent state.
                 */
                if (!READ_ONCE(rnp->qsmask) &&
-                   !rcu_preempt_blocked_readers_cgp(rnp))
+                   !rcu_preempt_blocked_readers_cgp(rnp, false))
                        break;
                /* If time for quiescent-state forcing, do it. */
                if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
@@ -2207,7 +2207,7 @@ static noinline void rcu_gp_cleanup(void)
        rcu_seq_end(&new_gp_seq);
        rcu_for_each_node_breadth_first(rnp) {
                raw_spin_lock_irq_rcu_node(rnp);
-               if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+               if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
                        dump_blkd_tasks(rnp, 10);
                WARN_ON_ONCE(rnp->qsmask);
                WRITE_ONCE(rnp->gp_seq, new_gp_seq);
@@ -2376,13 +2376,13 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
                }
                WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
                WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
-                            rcu_preempt_blocked_readers_cgp(rnp));
+                            rcu_preempt_blocked_readers_cgp(rnp, true));
                WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
                trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
                                                 mask, rnp->qsmask, rnp->level,
                                                 rnp->grplo, rnp->grphi,
                                                 !!rnp->gp_tasks);
-               if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+               if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp, true)) {
 
                        /* Other bits still set at this level, so done. */
                        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2428,7 +2428,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 
        raw_lockdep_assert_held_rcu_node(rnp);
        if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
-           WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
+           WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)) ||
            rnp->qsmask != 0) {
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;  /* Still need more quiescent states! */
@@ -2763,7 +2763,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                rcu_state.cbovldnext |= !!rnp->cbovldmask;
                if (rnp->qsmask == 0) {
-                       if (rcu_preempt_blocked_readers_cgp(rnp)) {
+                       if (rcu_preempt_blocked_readers_cgp(rnp, true)) {
                                /*
                                 * No point in scanning bits because they
                                 * are all zero.  But we might need to
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b71c6c1de8d3..25eb9200e6ef 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -486,7 +486,7 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
 /* Forward declarations for tree_plugin.h */
 static void rcu_bootup_announce(void);
 static void rcu_qs(void);
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote);
 #ifdef CONFIG_HOTPLUG_CPU
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad33fdd0efe8..6ed3815bb912 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -383,9 +383,28 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Check for preempted RCU readers blocking the current grace period
  * for the specified rcu_node structure.  If the caller needs a reliable
  * answer, it must hold the rcu_node's ->lock.
+ *
+ * If @promote is true and CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled,
+ * this function first promotes any tasks from per-CPU blocked lists to
+ * the rcu_node's blkd_tasks list before checking.  This ensures that
+ * late-arriving tasks (blocked after GP init's promotion scan) are
+ * visible for priority boosting and other operations.  When promoting,
+ * the caller must hold rnp->lock.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+       if (promote && rcu_is_leaf_node(rnp)) {
+               int cpu;
+               struct rcu_data *rdp;
+
+               raw_lockdep_assert_held_rcu_node(rnp);
+               for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+                       rdp = per_cpu_ptr(&rcu_data, cpu);
+                       rcu_promote_blocked_tasks_rdp(rdp, rnp);
+               }
+       }
+#endif
        return READ_ONCE(rnp->gp_tasks) != NULL;
 }
 
@@ -570,7 +589,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
                raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                WARN_ON_ONCE(rnp != t->rcu_blocked_node);
                WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
-               empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
+               empty_norm = !rcu_preempt_blocked_readers_cgp(rnp, true);
                WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
                             (!empty_norm || rnp->qsmask));
                empty_exp = sync_rcu_exp_done(rnp);
@@ -597,7 +616,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
                 * so we must take a snapshot of the expedited state.
                 */
                empty_exp_now = sync_rcu_exp_done(rnp);
-               if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
+               if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp, true)) {
                        trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
                                                         rnp->gp_seq,
                                                         0, rnp->qsmask,
@@ -901,7 +920,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 
        RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
        raw_lockdep_assert_held_rcu_node(rnp);
-       if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+       if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
                dump_blkd_tasks(rnp, 10);
        if (rcu_preempt_has_tasks(rnp) &&
            (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
@@ -1127,7 +1146,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Because preemptible RCU does not exist, there are never any preempted
  * RCU readers.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
        return 0;
 }
@@ -1221,6 +1240,9 @@ static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
 
 static void rcu_promote_blocked_tasks(struct rcu_node *rnp) { }
 
+static void rcu_promote_blocked_tasks_rdp(struct rcu_data *rdp,
+                                         struct rcu_node *rnp) { }
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /*
@@ -1378,7 +1400,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
        raw_lockdep_assert_held_rcu_node(rnp);
        if (!rnp->boost_kthread_task ||
-           (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
+           (!rcu_preempt_blocked_readers_cgp(rnp, true) && !rnp->exp_tasks)) {
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b67532cb8770..5aa65130ab5c 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -277,7 +277,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
        struct task_struct *t;
 
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
-       if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+       if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
@@ -331,7 +331,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
        struct task_struct *ts[8];
 
        lockdep_assert_irqs_disabled();
-       if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+       if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return 0;
        }
-- 
2.34.1
