On Thu, Feb 25, 2016 at 11:20:34AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 25, 2016 at 10:07:06AM +0000, Juri Lelli wrote:
> > Argh, this makes a lot of sense to me. I've actually pondered a tree/list
> > solution, but then decided to try the cumulative approach because it
> > looked nicer. But it contains holes, I'm afraid. As Luca already said,
> > GRUB shouldn't have these problems though.
> >
> > I'll try and see what introducing a list of blocked/throttled deadline
> > tasks means, considering also the interaction with cpusets and such.
> > Maybe it's simpler than it seems.
> >
> > I'm not sure this will come anytime soon, unfortunately. I'm almost 100%
> > on the sched-freq/schedutil discussion these days.
>
> Just skip sleep and write them when it's dark outside :-)
>
> > Anyway, do you also think that what we want to solve the root domain
> > issue is something based on rq_online/offline and per-rq information?
> > Everything else that I tried or thought of was broken/more horrible. :-/
>
> I was still trying to get my head around this; the above was my
> suggestion for the per-rq state, but I've not thought hard on alternative
> approaches to the root_domain issue.
So the below is the inactive list; it seems to not insta-explode when I
run a few simple dl proglets.

I don't particularly like it because it makes wakeups (esp. cross-cpu
ones) more expensive for the benefit of hotplug/cpusets, which is
something that 'never' happens.

So what I'm going to try and do is forget all about this here patch and
see what I can do with a full task-list iteration on rebuild. But I
figured that since I wrote it and it might work, I might as well post it.

---
 include/linux/sched.h   |   5 ++
 kernel/sched/core.c     |   6 ++-
 kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c     |   2 +-
 kernel/sched/sched.h    |   7 ++-
 5 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea12c6b7..d9848eac35f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,6 +1351,11 @@ struct sched_dl_entity {
	 * own bandwidth to be enforced, thus we need one timer per task.
	 */
	struct hrtimer dl_timer;
+
+#ifdef CONFIG_SMP
+	struct list_head dl_inactive_entry;
+	int dl_inactive_cpu;
+#endif
 };
 
 union rcu_special {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a724e1..7f3fab6349a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1162,7 +1162,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p);
+			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_event_task_migrate(p);
	}
@@ -2077,6 +2077,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
	RB_CLEAR_NODE(&p->dl.rb_node);
	init_dl_task_timer(&p->dl);
	__dl_clear_params(p);
+#ifdef CONFIG_SMP
+	INIT_LIST_HEAD(&p->dl.dl_inactive_entry);
+#endif
 
	INIT_LIST_HEAD(&p->rt.run_list);
	p->rt.timeout = 0;
@@ -5397,6 +5400,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
		migrate_tasks(rq);
		BUG_ON(rq->nr_running != 1); /* the migration thread */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		migrate_inactive_dl(rq);
		break;
 
	case CPU_DEAD:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036facbe1..f999b8bb6fea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,6 +80,9 @@ void init_dl_rq(struct dl_rq *dl_rq)
	dl_rq->dl_nr_migratory = 0;
	dl_rq->overloaded = 0;
	dl_rq->pushable_dl_tasks_root = RB_ROOT;
+
+	raw_spin_lock_init(&dl_rq->dl_inactive_lock);
+	INIT_LIST_HEAD(&dl_rq->dl_inactive_list);
 #else
	init_dl_bw(&dl_rq->dl_bw);
 #endif
@@ -289,6 +292,62 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
	return later_rq;
 }
 
+static void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	raw_spin_lock(&dl_rq->dl_inactive_lock);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, rq_of_dl_rq(dl_rq)->cpu);
+	list_add(&dl_se->dl_inactive_entry, &dl_rq->dl_inactive_list);
+	raw_spin_unlock(&dl_rq->dl_inactive_lock);
+}
+
+static void dequeue_inactive(struct sched_dl_entity *dl_se)
+{
+	int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+	struct rq *rq;
+
+again:
+	if (cpu == -1)
+		return;
+	rq = cpu_rq(cpu);
+
+	raw_spin_lock(&rq->dl.dl_inactive_lock);
+	tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+	if (cpu != tmp) {
+		cpu = tmp;
+		raw_spin_unlock(&rq->dl.dl_inactive_lock);
+		goto again;
+	}
+	list_del_init(&dl_se->dl_inactive_entry);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, -1);
+	raw_spin_unlock(&rq->dl.dl_inactive_lock);
+}
+
+static void
+migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu)
+{
+	int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+	struct rq *src_rq, *dst_rq;
+
+	dst_rq = cpu_rq(new_cpu);
+again:
+	if (cpu == -1)
+		return;
+	src_rq = cpu_rq(cpu);
+
+	double_raw_lock(&src_rq->dl.dl_inactive_lock,
+			&dst_rq->dl.dl_inactive_lock);
+	tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+	if (cpu != tmp) {
+		cpu = tmp;
+		raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+		raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+		goto again;
+	}
+	list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, new_cpu);
+	raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+	raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
 #else
 
 static inline
@@ -327,6 +386,11 @@ static inline void queue_push_tasks(struct rq *rq)
 static inline void queue_pull_task(struct rq *rq)
 {
 }
+
+static inline void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { }
+static inline void dequeue_inactive(struct sched_dl_entity *dl_se) { }
+static inline void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu) { }
+
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -960,6 +1024,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
		return;
 
+	if (!(flags & ENQUEUE_RESTORE))
+		dequeue_inactive(&p->dl);
+
	enqueue_dl_entity(&p->dl, pi_se, flags);
 
	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -970,6 +1037,8 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
	dequeue_dl_entity(&p->dl);
	dequeue_pushable_dl_task(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		enqueue_inactive(&p->dl, &rq->dl);
 }
 
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1074,6 +1143,34 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
	resched_curr(rq);
 }
 
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	if (list_empty(&dl_se->dl_inactive_entry))
+		return;
+
+	migrate_inactive(dl_se, new_cpu);
+}
+
+void migrate_inactive_dl(struct rq *src_rq)
+{
+	int cpu = cpumask_any_and(src_rq->rd->online, cpu_active_mask);
+	struct rq *dst_rq = cpu_rq(cpu);
+	struct sched_dl_entity *dl_se, *tmp;
+
+	double_raw_lock(&src_rq->dl.dl_inactive_lock,
+			&dst_rq->dl.dl_inactive_lock);
+
+	list_for_each_entry_safe(dl_se, tmp, &src_rq->dl.dl_inactive_list, dl_inactive_entry) {
+		WRITE_ONCE(dl_se->dl_inactive_cpu, cpu);
+		list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+	}
+
+	raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+	raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1211,13 +1308,19 @@ static void task_dead_dl(struct task_struct *p)
 {
	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
+	local_irq_disable();
+
	/*
	 * Since we are TASK_DEAD we won't slip out of the domain!
	 */
-	raw_spin_lock_irq(&dl_b->lock);
+	raw_spin_lock(&dl_b->lock);
	/* XXX we should retain the bw until 0-lag */
	dl_b->total_bw -= p->dl.dl_bw;
-	raw_spin_unlock_irq(&dl_b->lock);
+	raw_spin_unlock(&dl_b->lock);
+
+	dequeue_inactive(&p->dl);
+
+	local_irq_enable();
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1702,7 +1805,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
	 * this is the right place to try to pull some other one
	 * from an overloaded cpu, if any.
	 */
-	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+	if (!task_on_rq_queued(p)) {
+		dequeue_inactive(&p->dl);
+		return;
+	}
+
+	if (rq->dl.dl_nr_running)
		return;
 
	queue_pull_task(rq);
 }
@@ -1728,6 +1836,9 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
			resched_curr(rq);
 #endif
	}
+
+	if (!task_on_rq_queued(p))
+		enqueue_inactive(&p->dl, &rq->dl);
 }
 
 /*
@@ -1779,6 +1890,7 @@ const struct sched_class dl_sched_class = {
 
 #ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_dl,
+	.migrate_task_rq	= migrate_task_rq_dl,
	.set_cpus_allowed	= set_cpus_allowed_dl,
	.rq_online		= rq_online_dl,
	.rq_offline		= rq_offline_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 303d6392b389..04e856a85c0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,7 +5231,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 */
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
	/*
	 * We are supposed to update the task to "current" time, then its up to date
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6d4a3fa3660..0de1e2894d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct dl_rq {
	 */
	struct rb_root pushable_dl_tasks_root;
	struct rb_node *pushable_dl_tasks_leftmost;
+
+	raw_spinlock_t dl_inactive_lock;
+	struct list_head dl_inactive_list;
 #else
	struct dl_bw dl_bw;
 #endif
@@ -776,6 +779,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
 
 #ifdef CONFIG_SMP
 
+extern void migrate_inactive_dl(struct rq *src_rq);
+
 static inline void
 queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
@@ -1205,7 +1210,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-	void (*migrate_task_rq)(struct task_struct *p);
+	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
	void (*task_waking) (struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);