On Tue, May 20, 2014 at 04:01:55PM +0200, Peter Zijlstra wrote:

> Hmm, doesn't this also mean it's entirely unsafe to call
> hrtimer_forward*() from the timer callback, because it might be changing
> the time of an already enqueued timer, which would corrupt the rb-tree
> order.
> 
> Lemme go find a nice way out of this mess, I think I'm responsible for
> creating it in the first place :-(

Ah, so since the hrtimer core doesn't provide any serialization of its
own, by design, we have to serialize access to the timer ourselves,
which is easily done.

Something like so would do, I suppose.

Here we make sure the bandwidth lock (cfs_b->lock or
rt_b->rt_runtime_lock, respectively) is held both around starting the
timer and inside the handler while its expiry time is being forwarded.

---
 kernel/hrtimer.c     |  9 +++++---
 kernel/sched/core.c  | 12 ++++++----
 kernel/sched/fair.c  | 65 +++++++++++-----------------------------------------
 kernel/sched/rt.c    | 14 +++++------
 kernel/sched/sched.h |  4 ++--
 5 files changed, 34 insertions(+), 70 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3ab28993f6e0..28942c65635e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1273,11 +1273,14 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
         * Note: We clear the CALLBACK bit after enqueue_hrtimer and
         * we do not reprogramm the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
+        *
+        * Note: Because we dropped the cpu_base->lock above,
+        * hrtimer_start_range_ns() can have popped in and enqueued the timer
+        * for us already.
         */
-       if (restart != HRTIMER_NORESTART) {
-               BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+       if (restart != HRTIMER_NORESTART &&
+           !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base);
-       }
 
        WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ea7b3f1a087..1f4602993d37 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -106,17 +106,17 @@ void __smp_mb__after_atomic(void)
 EXPORT_SYMBOL(__smp_mb__after_atomic);
 #endif
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+int start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
        unsigned long delta;
-       ktime_t soft, hard, now;
+       ktime_t soft, hard;
+       int overruns = 0;
 
        for (;;) {
-               if (hrtimer_active(period_timer))
+               if (hrtimer_is_queued(period_timer))
                        break;
 
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
+               overruns += hrtimer_forward_now(period_timer, period);
 
                soft = hrtimer_get_softexpires(period_timer);
                hard = hrtimer_get_expires(period_timer);
@@ -124,6 +124,8 @@ void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
                __hrtimer_start_range_ns(period_timer, soft, delta,
                                         HRTIMER_MODE_ABS_PINNED, 0);
        }
+
+       return overruns;
 }
 
 DEFINE_MUTEX(sched_domains_mutex);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28ccf502c63c..11152b621a31 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3146,15 +3146,13 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                amount = min_amount;
        else {
                /*
-                * If the bandwidth pool has become inactive, then at least one
+                * If we had overruns starting the timer, then at least one
                 * period must have elapsed since the last consumption.
                 * Refresh the global state and ensure bandwidth timer becomes
                 * active.
                 */
-               if (!cfs_b->timer_active) {
+               if (start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period))
                        __refill_cfs_bandwidth_runtime(cfs_b);
-                       __start_cfs_bandwidth(cfs_b);
-               }
 
                if (cfs_b->runtime > 0) {
                        amount = min(cfs_b->runtime, min_amount);
@@ -3331,8 +3329,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->throttled_clock = rq_clock(rq);
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       if (!cfs_b->timer_active)
-               __start_cfs_bandwidth(cfs_b);
+       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
        raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3430,12 +3427,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
        u64 runtime, runtime_expires;
-       int idle = 1, throttled;
+       bool idle, throttled;
+
+       lockdep_assert_held(&cfs_b->lock);
 
-       raw_spin_lock(&cfs_b->lock);
        /* no need to continue the timer with no bandwidth constraint */
        if (cfs_b->quota == RUNTIME_INF)
-               goto out_unlock;
+               return 1;
 
        throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        /* idle depends on !throttled (for the case of a large deficit) */
@@ -3444,21 +3442,14 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 
        /* if we're going inactive then everything else can be deferred */
        if (idle)
-               goto out_unlock;
-
-       /*
-        * if we have relooped after returning idle once, we need to update our
-        * status as actually running, so that other cpus doing
-        * __start_cfs_bandwidth will stop trying to cancel us.
-        */
-       cfs_b->timer_active = 1;
+               return 1;
 
        __refill_cfs_bandwidth_runtime(cfs_b);
 
        if (!throttled) {
                /* mark as potentially idle for the upcoming period */
                cfs_b->idle = 1;
-               goto out_unlock;
+               return 0;
        }
 
        /* account preceding periods in which throttling occurred */
@@ -3498,12 +3489,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         * timer to remain active while there are any throttled entities.)
         */
        cfs_b->idle = 0;
-out_unlock:
-       if (idle)
-               cfs_b->timer_active = 0;
-       raw_spin_unlock(&cfs_b->lock);
-
-       return idle;
+       return 0;
 }
 
 /* a cfs_rq won't donate quota below this amount */
@@ -3676,19 +3662,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, period_timer);
-       ktime_t now;
        int overrun;
        int idle = 0;
 
+       raw_spin_lock(&cfs_b->lock);
        for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+               overrun = hrtimer_forward_now(timer, cfs_b->period);
                if (!overrun)
                        break;
 
                idle = do_sched_cfs_period_timer(cfs_b, overrun);
        }
+       raw_spin_unlock(&cfs_b->lock);
 
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -3713,30 +3698,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-       /*
-        * The timer may be active because we're trying to set a new bandwidth
-        * period or because we're racing with the tear-down path
-        * (timer_active==0 becomes visible before the hrtimer call-back
-        * terminates).  In either case we ensure that it's re-programmed
-        */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-              hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-               /* bounce the lock to allow do_sched_cfs_period_timer to run */
-               raw_spin_unlock(&cfs_b->lock);
-               cpu_relax();
-               raw_spin_lock(&cfs_b->lock);
-               /* if someone else restarted the timer then we're done */
-               if (cfs_b->timer_active)
-                       return;
-       }
-
-       cfs_b->timer_active = 1;
-       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
-}
-
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
        hrtimer_cancel(&cfs_b->period_timer);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7795e292f4c9..60bfb8caa27a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -17,19 +17,20 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
-       ktime_t now;
-       int overrun;
        int idle = 0;
+       int overrun;
 
+       raw_spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+               overrun = hrtimer_forward_now(timer, rt_b->rt_period);
                if (!overrun)
                        break;
 
+               raw_spin_unlock(&rt_b->rt_runtime_lock);
                idle = do_sched_rt_period_timer(rt_b, overrun);
+               raw_spin_lock(&rt_b->rt_runtime_lock);
        }
+       raw_spin_unlock(&rt_b->rt_runtime_lock);
 
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -51,9 +52,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
 
-       if (hrtimer_active(&rt_b->rt_period_timer))
-               return;
-
        raw_spin_lock(&rt_b->rt_runtime_lock);
        start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
        raw_spin_unlock(&rt_b->rt_runtime_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2cbe81308af..0937e341e079 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -187,7 +187,7 @@ struct cfs_bandwidth {
        s64 hierarchal_quota;
        u64 runtime_expires;
 
-       int idle, timer_active;
+       int idle;
        struct hrtimer period_timer, slack_timer;
        struct list_head throttled_cfs_rq;
 
@@ -1288,7 +1288,7 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+extern int start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
