On Thu, Feb 25, 2016 at 11:20:34AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 25, 2016 at 10:07:06AM +0000, Juri Lelli wrote:
> > Argh, this makes a lot of sense to me. I've actually pondered a tree/list
> > solution, but then decided to try the cumulative approach because it
> > looked nicer. But it contains holes, I'm afraid. As Luca already said,
> > GRUB shouldn't have these problems though.
> >
> > I'll try and see what introducing a list of blocked/throttled deadline
> > tasks means, considering also the interaction with cpusets and such.
> > Maybe it's simpler than it seems.
> >
> > I'm not sure this will come anytime soon, unfortunately. I'm almost 100%
> > on the sched-freq/schedutil discussion these days.
>
> Just skip sleep and write them when it's dark outside :-)
>
> > Anyway, do you also think that what we want to solve the root domain
> > issue is something based on rq_online/offline and per-rq information?
> > Everything else that I tried or thought of was broken/more horrible. :-/
>
> I was still trying to get my head around this; the above was my
> suggestion for the per-rq state, but I've not thought hard on alternative
> approaches to the root_domain issue.
So the below is the inactive list; it seems to not insta-explode when I
run a few simple dl proglets.

I don't particularly like it because it makes wakeups (esp. cross-cpu
ones) more expensive for the benefit of hotplug/cpusets, which is
something that 'never' happens.

So what I'm going to try and do is forget all about this here patch and
see what I can do with a full task-list iteration on rebuild. But I
figured that since I wrote it and it might work, I might as well post it.

---
 include/linux/sched.h   |   5 ++
 kernel/sched/core.c     |   6 ++-
 kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c     |   2 +-
 kernel/sched/sched.h    |   7 ++-
 5 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea12c6b7..d9848eac35f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,6 +1351,11 @@ struct sched_dl_entity {
	 * own bandwidth to be enforced, thus we need one timer per task.
	 */
	struct hrtimer dl_timer;
+
+#ifdef CONFIG_SMP
+	struct list_head dl_inactive_entry;
+	int dl_inactive_cpu;
+#endif
 };
 
 union rcu_special {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a724e1..7f3fab6349a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1162,7 +1162,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p);
+			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_event_task_migrate(p);
	}
@@ -2077,6 +2077,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
	RB_CLEAR_NODE(&p->dl.rb_node);
	init_dl_task_timer(&p->dl);
	__dl_clear_params(p);
+#ifdef CONFIG_SMP
+	INIT_LIST_HEAD(&p->dl.dl_inactive_entry);
+#endif
 
	INIT_LIST_HEAD(&p->rt.run_list);
	p->rt.timeout = 0;
@@ -5397,6 +5400,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
		migrate_tasks(rq);
		BUG_ON(rq->nr_running != 1); /* the migration thread */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		migrate_inactive_dl(rq);
		break;
 
	case CPU_DEAD:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036facbe1..f999b8bb6fea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,6 +80,9 @@ void init_dl_rq(struct dl_rq *dl_rq)
	dl_rq->dl_nr_migratory = 0;
	dl_rq->overloaded = 0;
	dl_rq->pushable_dl_tasks_root = RB_ROOT;
+
+	raw_spin_lock_init(&dl_rq->dl_inactive_lock);
+	INIT_LIST_HEAD(&dl_rq->dl_inactive_list);
 #else
	init_dl_bw(&dl_rq->dl_bw);
 #endif
@@ -289,6 +292,62 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
	return later_rq;
 }
 
+static void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	raw_spin_lock(&dl_rq->dl_inactive_lock);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, rq_of_dl_rq(dl_rq)->cpu);
+	list_add(&dl_se->dl_inactive_entry, &dl_rq->dl_inactive_list);
+	raw_spin_unlock(&dl_rq->dl_inactive_lock);
+}
+
+static void dequeue_inactive(struct sched_dl_entity *dl_se)
+{
+	int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+	struct rq *rq;
+
+again:
+	if (cpu == -1)
+		return;
+	rq = cpu_rq(cpu);
+
+	raw_spin_lock(&rq->dl.dl_inactive_lock);
+	tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+	if (cpu != tmp) {
+		cpu = tmp;
+		raw_spin_unlock(&rq->dl.dl_inactive_lock);
+		goto again;
+	}
+	list_del_init(&dl_se->dl_inactive_entry);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, -1);
+	raw_spin_unlock(&rq->dl.dl_inactive_lock);
+}
+
+static void
+migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu)
+{
+	int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+	struct rq *src_rq, *dst_rq;
+
+	dst_rq = cpu_rq(new_cpu);
+again:
+	if (cpu == -1)
+		return;
+	src_rq = cpu_rq(cpu);
+
+	double_raw_lock(&src_rq->dl.dl_inactive_lock,
+			&dst_rq->dl.dl_inactive_lock);
+	tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+	if (cpu != tmp) {
+		cpu = tmp;
+		raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+		raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+		goto again;
+	}
+	list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+	WRITE_ONCE(dl_se->dl_inactive_cpu, new_cpu);
+	raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+	raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
 #else
 
 static inline
@@ -327,6 +386,11 @@ static inline void queue_push_tasks(struct rq *rq)
 static inline void queue_pull_task(struct rq *rq)
 {
 }
+
+static inline void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { }
+static inline void dequeue_inactive(struct sched_dl_entity *dl_se) { }
+static inline void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu) { }
+
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -960,6 +1024,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
		return;
 
+	if (!(flags & ENQUEUE_RESTORE))
+		dequeue_inactive(&p->dl);
+
	enqueue_dl_entity(&p->dl, pi_se, flags);
 
	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -970,6 +1037,8 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
	dequeue_dl_entity(&p->dl);
	dequeue_pushable_dl_task(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		enqueue_inactive(&p->dl, &rq->dl);
 }
 
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1074,6 +1143,34 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
	resched_curr(rq);
 }
 
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	if (list_empty(&dl_se->dl_inactive_entry))
+		return;
+
+	migrate_inactive(dl_se, new_cpu);
+}
+
+void migrate_inactive_dl(struct rq *src_rq)
+{
+	int cpu = cpumask_any_and(src_rq->rd->online, cpu_active_mask);
+	struct rq *dst_rq = cpu_rq(cpu);
+	struct sched_dl_entity *dl_se, *tmp;
+
+	double_raw_lock(&src_rq->dl.dl_inactive_lock,
+			&dst_rq->dl.dl_inactive_lock);
+
+	list_for_each_entry_safe(dl_se, tmp, &src_rq->dl.dl_inactive_list, dl_inactive_entry) {
+		WRITE_ONCE(dl_se->dl_inactive_cpu, cpu);
+		list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+	}
+
+	raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+	raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1211,13 +1308,19 @@ static void task_dead_dl(struct task_struct *p)
 {
	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
+	local_irq_disable();
+
	/*
	 * Since we are TASK_DEAD we won't slip out of the domain!
	 */
-	raw_spin_lock_irq(&dl_b->lock);
+	raw_spin_lock(&dl_b->lock);
	/* XXX we should retain the bw until 0-lag */
	dl_b->total_bw -= p->dl.dl_bw;
-	raw_spin_unlock_irq(&dl_b->lock);
+	raw_spin_unlock(&dl_b->lock);
+
+	dequeue_inactive(&p->dl);
+
+	local_irq_enable();
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1702,7 +1805,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
	 * this is the right place to try to pull some other one
	 * from an overloaded cpu, if any.
	 */
-	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+	if (!task_on_rq_queued(p)) {
+		dequeue_inactive(&p->dl);
+		return;
+	}
+
+	if (rq->dl.dl_nr_running)
		return;
 
	queue_pull_task(rq);
 }
@@ -1728,6 +1836,9 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
			resched_curr(rq);
 #endif
	}
+
+	if (!task_on_rq_queued(p))
+		enqueue_inactive(&p->dl, &rq->dl);
 }
 
 /*
@@ -1779,6 +1890,7 @@ const struct sched_class dl_sched_class = {
 
 #ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_dl,
+	.migrate_task_rq	= migrate_task_rq_dl,
	.set_cpus_allowed	= set_cpus_allowed_dl,
	.rq_online		= rq_online_dl,
	.rq_offline		= rq_offline_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 303d6392b389..04e856a85c0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,7 +5231,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 */
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
	/*
	 * We are supposed to update the task to "current" time, then its up to date
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6d4a3fa3660..0de1e2894d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct dl_rq {
	 */
	struct rb_root pushable_dl_tasks_root;
	struct rb_node *pushable_dl_tasks_leftmost;
+
+	raw_spinlock_t dl_inactive_lock;
+	struct list_head dl_inactive_list;
 #else
	struct dl_bw dl_bw;
 #endif
@@ -776,6 +779,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
 
 #ifdef CONFIG_SMP
 
+extern void migrate_inactive_dl(struct rq *src_rq);
+
 static inline void
 queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
@@ -1205,7 +1210,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-	void (*migrate_task_rq)(struct task_struct *p);
+	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
	void (*task_waking) (struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);