On Wed, Oct 07, 2015 at 01:01:20PM +0200, Peter Zijlstra wrote:

> That again doesn't explain which UNLOCKs with non-matching lock values
> it pairs with and what particular ordering is important here.

So after staring at that stuff for a while I came up with the following.
Does this make sense, or am I completely misunderstanding things?

Not been near a compiler.

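For reference, the failure mode I have in mind is the store-buffering-like
pattern below (a hand-rolled sketch, not taken from the patch; 'leaf' and
'root' stand for two different rcu_node levels and a/b are made-up shared
variables):

	CPU 0				CPU 1
	-----				-----
	WRITE_ONCE(a, 1);		WRITE_ONCE(b, 1);
	raw_spin_unlock(&leaf->lock);	smp_mb();
	raw_spin_lock(&root->lock);	r2 = READ_ONCE(a);
	smp_mb__after_unlock_lock();
	r1 = READ_ONCE(b);

Because &leaf->lock and &root->lock are different lock variables, the
UNLOCK+LOCK on CPU 0 does not by itself order the store to 'a' against the
later load from 'b', so r1 == 0 && r2 == 0 is allowed without the
smp_mb__after_unlock_lock(); with it, the sequence acts as a full memory
barrier and that outcome is forbidden.
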
---
 kernel/rcu/tree.c | 99 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 61 insertions(+), 38 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 775d36cc0050..46e1e23ff762 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1460,6 +1460,48 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 }
 
 /*
+ * Wrappers for the rcu_node::lock acquire.
+ *
+ * Because the rcu_node structures form a tree, a tree traversal will acquire
+ * a series of different locks. This in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * most importantly, transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ */
+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
+{
+       raw_spin_lock(&rnp->lock);
+       smp_mb__after_unlock_lock();
+}
+
+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
+{
+       raw_spin_lock_irq(&rnp->lock);
+       smp_mb__after_unlock_lock();
+}
+
+static inline void
+_raw_spin_lock_irqsave_rcu_node(struct rcu_node *rnp, unsigned long *flags)
+{
+       raw_spin_lock_irqsave(&rnp->lock, *flags);
+       smp_mb__after_unlock_lock();
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
+       _raw_spin_lock_irqsave_rcu_node((rnp), &(flags))
+
+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
+{
+       bool locked = raw_spin_trylock(&rnp->lock);
+       if (locked)
+               smp_mb__after_unlock_lock();
+       return locked;
+}
+
+
+/*
  * Start some future grace period, as needed to handle newly arrived
  * callbacks.  The required future grace periods are recorded in each
  * rcu_node structure's ->need_future_gp field.  Returns true if there
@@ -1512,10 +1554,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
         * hold it, acquire the root rcu_node structure's lock in order to
         * start one (if needed).
         */
-       if (rnp != rnp_root) {
-               raw_spin_lock(&rnp_root->lock);
-               smp_mb__after_unlock_lock();
-       }
+       if (rnp != rnp_root)
+               raw_spin_lock_rcu_node(rnp_root);
 
        /*
         * Get a new grace-period number.  If there really is no grace
@@ -1764,11 +1804,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
        if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
             rdp->completed == READ_ONCE(rnp->completed) &&
             !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
-           !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+           !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
                local_irq_restore(flags);
                return;
        }
-       smp_mb__after_unlock_lock();
        needwake = __note_gp_changes(rsp, rnp, rdp);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (needwake)
@@ -1792,8 +1831,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
        struct rcu_node *rnp = rcu_get_root(rsp);
 
        WRITE_ONCE(rsp->gp_activity, jiffies);
-       raw_spin_lock_irq(&rnp->lock);
-       smp_mb__after_unlock_lock();
+       raw_spin_lock_irq_rcu_node(rnp);
        if (!READ_ONCE(rsp->gp_flags)) {
                /* Spurious wakeup, tell caller to go back to sleep.  */
                raw_spin_unlock_irq(&rnp->lock);
@@ -1825,8 +1863,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
         */
        rcu_for_each_leaf_node(rsp, rnp) {
                rcu_gp_slow(rsp, gp_preinit_delay);
-               raw_spin_lock_irq(&rnp->lock);
-               smp_mb__after_unlock_lock();
+               raw_spin_lock_irq_rcu_node(rnp);
                if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
                    !rnp->wait_blkd_tasks) {
                        /* Nothing to do on this leaf rcu_node structure. */
@@ -1882,8 +1919,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
         */
        rcu_for_each_node_breadth_first(rsp, rnp) {
                rcu_gp_slow(rsp, gp_init_delay);
-               raw_spin_lock_irq(&rnp->lock);
-               smp_mb__after_unlock_lock();
+               raw_spin_lock_irq_rcu_node(rnp);
                rdp = this_cpu_ptr(rsp->rda);
                rcu_preempt_check_blocked_tasks(rnp);
                rnp->qsmask = rnp->qsmaskinit;
@@ -1953,8 +1989,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
        }
        /* Clear flag to prevent immediate re-entry. */
        if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
-               raw_spin_lock_irq(&rnp->lock);
-               smp_mb__after_unlock_lock();
+               raw_spin_lock_irq_rcu_node(rnp);
                WRITE_ONCE(rsp->gp_flags,
                           READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
                raw_spin_unlock_irq(&rnp->lock);
@@ -1974,8 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        struct rcu_node *rnp = rcu_get_root(rsp);
 
        WRITE_ONCE(rsp->gp_activity, jiffies);
-       raw_spin_lock_irq(&rnp->lock);
-       smp_mb__after_unlock_lock();
+       raw_spin_lock_irq_rcu_node(rnp);
        gp_duration = jiffies - rsp->gp_start;
        if (gp_duration > rsp->gp_max)
                rsp->gp_max = gp_duration;
@@ -2000,8 +2034,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
         * grace period is recorded in any of the rcu_node structures.
         */
        rcu_for_each_node_breadth_first(rsp, rnp) {
-               raw_spin_lock_irq(&rnp->lock);
-               smp_mb__after_unlock_lock();
+               raw_spin_lock_irq_rcu_node(rnp);
                WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
                WARN_ON_ONCE(rnp->qsmask);
                WRITE_ONCE(rnp->completed, rsp->gpnum);
@@ -2016,8 +2049,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                rcu_gp_slow(rsp, gp_cleanup_delay);
        }
        rnp = rcu_get_root(rsp);
-       raw_spin_lock_irq(&rnp->lock);
-       smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+       raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
        rcu_nocb_gp_set(rnp, nocb);
 
        /* Declare grace period done. */
@@ -2264,8 +2296,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                rnp_c = rnp;
                rnp = rnp->parent;
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               smp_mb__after_unlock_lock();
+               raw_spin_lock_irqsave_rcu_node(rnp, flags);
                oldmask = rnp_c->qsmask;
        }
 
@@ -2312,8 +2343,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
        gps = rnp->gpnum;
        mask = rnp->grpmask;
        raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
-       raw_spin_lock(&rnp_p->lock);    /* irqs already disabled. */
-       smp_mb__after_unlock_lock();
+       raw_spin_lock_rcu_node(rnp_p);  /* irqs already disabled. */
        rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
 }
 
@@ -2335,8 +2365,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
        struct rcu_node *rnp;
 
        rnp = rdp->mynode;
-       raw_spin_lock_irqsave(&rnp->lock, flags);
-       smp_mb__after_unlock_lock();
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
        if ((rdp->passed_quiesce == 0 &&
             rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
            rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
@@ -2562,8 +2591,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
                rnp = rnp->parent;
                if (!rnp)
                        break;
-               raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-               smp_mb__after_unlock_lock(); /* GP memory ordering. */
+               raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                rnp->qsmaskinit &= ~mask;
                rnp->qsmask &= ~mask;
                if (rnp->qsmaskinit) {
@@ -2591,8 +2619,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 
        /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
        mask = rdp->grpmask;
-       raw_spin_lock_irqsave(&rnp->lock, flags);
-       smp_mb__after_unlock_lock();    /* Enforce GP memory-order guarantee. */
+       raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
        rnp->qsmaskinitnext &= ~mask;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -2789,8 +2816,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
        rcu_for_each_leaf_node(rsp, rnp) {
                cond_resched_rcu_qs();
                mask = 0;
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               smp_mb__after_unlock_lock();
+               raw_spin_lock_irqsave_rcu_node(rnp, flags);
                if (rnp->qsmask == 0) {
                        if (rcu_state_p == &rcu_sched_state ||
                            rsp != rcu_state_p ||
@@ -2861,8 +2887,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
        /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
 
        /* Reached the root of the rcu_node tree, acquire lock. */
-       raw_spin_lock_irqsave(&rnp_old->lock, flags);
-       smp_mb__after_unlock_lock();
+       raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
        raw_spin_unlock(&rnp_old->fqslock);
        if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
                rsp->n_force_qs_lh++;
@@ -2985,8 +3010,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
                if (!rcu_gp_in_progress(rsp)) {
                        struct rcu_node *rnp_root = rcu_get_root(rsp);
 
-                       raw_spin_lock(&rnp_root->lock);
-                       smp_mb__after_unlock_lock();
+                       raw_spin_lock_rcu_node(rnp_root);
                        needwake = rcu_start_gp(rsp);
                        raw_spin_unlock(&rnp_root->lock);
                        if (needwake)
@@ -3925,8 +3949,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
         */
        rnp = rdp->mynode;
        mask = rdp->grpmask;
-       raw_spin_lock(&rnp->lock);              /* irqs already disabled. */
-       smp_mb__after_unlock_lock();
+       raw_spin_lock_rcu_node(rnp);            /* irqs already disabled. */
        rnp->qsmaskinitnext |= mask;
        rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
        rdp->completed = rnp->completed;
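
For completeness: smp_mb__after_unlock_lock() is a no-op on everything
except PowerPC, where, if I'm reading the tree right,
arch/powerpc/include/asm/spinlock.h has:

	#define smp_mb__after_unlock_lock()	smp_mb()

so these wrappers only cost something on architectures where UNLOCK+LOCK
of two different locks really is weaker than a full memory barrier.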