On 2020/7/20 15:23, benbjiang(蒋彪) wrote: > Hi, > >> On Jul 20, 2020, at 2:06 PM, Li, Aubrey <aubrey...@linux.intel.com >> <mailto:aubrey...@linux.intel.com>> wrote: >> >> On 2020/7/20 12:06, benbjiang(蒋彪) wrote: >>> Hi, >>> >>>> On Jul 1, 2020, at 5:32 AM, Vineeth Remanan Pillai >>>> <vpil...@digitalocean.com <mailto:vpil...@digitalocean.com>> wrote: >>>> >>>> From: Peter Zijlstra <pet...@infradead.org <mailto:pet...@infradead.org>> >>>> >>>> When a sibling is forced-idle to match the core-cookie; search for >>>> matching tasks to fill the core. >>>> >>>> rcu_read_unlock() can incur an infrequent deadlock in >>>> sched_core_balance(). Fix this by using the RCU-sched flavor instead. >>>> >>>> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org >>>> <mailto:pet...@infradead.org>> >>>> Signed-off-by: Joel Fernandes (Google) <j...@joelfernandes.org >>>> <mailto:j...@joelfernandes.org>> >>>> Acked-by: Paul E. McKenney <paul...@kernel.org <mailto:paul...@kernel.org>> >>>> --- >>>> include/linux/sched.h | 1 + >>>> kernel/sched/core.c | 131 +++++++++++++++++++++++++++++++++++++++++- >>>> kernel/sched/idle.c | 1 + >>>> kernel/sched/sched.h | 6 ++ >>>> 4 files changed, 138 insertions(+), 1 deletion(-) >>>> >>>> diff --git a/include/linux/sched.h b/include/linux/sched.h >>>> index 3c8dcc5ff039..4f9edf013df3 100644 >>>> --- a/include/linux/sched.h >>>> +++ b/include/linux/sched.h >>>> @@ -688,6 +688,7 @@ struct task_struct { >>>> #ifdef CONFIG_SCHED_CORE >>>> struct rb_nodecore_node; >>>> unsigned longcore_cookie; >>>> +unsigned intcore_occupation; >>>> #endif >>>> >>>> #ifdef CONFIG_CGROUP_SCHED >>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >>>> index 4d6d6a678013..fb9edb09ead7 100644 >>>> --- a/kernel/sched/core.c >>>> +++ b/kernel/sched/core.c >>>> @@ -201,6 +201,21 @@ static struct task_struct *sched_core_find(struct rq >>>> *rq, unsigned long cookie) >>>> return match; >>>> } >>>> >>>> +static struct task_struct *sched_core_next(struct task_struct *p, >>>> unsigned long cookie) >>>> +{ >>>> +struct rb_node *node = &p->core_node; >>>> + >>>> +node = rb_next(node); >>>> +if (!node) >>>> +return NULL; >>>> + >>>> +p = container_of(node, struct task_struct, core_node); >>>> +if (p->core_cookie != cookie) >>>> +return NULL; >>>> + >>>> +return p; >>>> +} >>>> + >>>> /* >>>> * The static-key + stop-machine variable are needed such that: >>>> * >>>> @@ -4233,7 +4248,7 @@ pick_next_task(struct rq *rq, struct task_struct >>>> *prev, struct rq_flags *rf) >>>> struct task_struct *next, *max = NULL; >>>> const struct sched_class *class; >>>> const struct cpumask *smt_mask; >>>> -int i, j, cpu; >>>> +int i, j, cpu, occ = 0; >>>> bool need_sync; >>>> >>>> if (!sched_core_enabled(rq)) >>>> @@ -4332,6 +4347,9 @@ pick_next_task(struct rq *rq, struct task_struct >>>> *prev, struct rq_flags *rf) >>>> goto done; >>>> } >>>> >>>> +if (!is_idle_task(p)) >>>> +occ++; >>>> + >>>> rq_i->core_pick = p; >>>> >>>> /* >>>> @@ -4357,6 +4375,7 @@ pick_next_task(struct rq *rq, struct task_struct >>>> *prev, struct rq_flags *rf) >>>> >>>> cpu_rq(j)->core_pick = NULL; >>>> } >>>> +occ = 1; >>>> goto again; >>>> } else { >>>> /* >>>> @@ -4393,6 +4412,8 @@ next_class:; >>>> if (is_idle_task(rq_i->core_pick) && rq_i->nr_running) >>>> rq_i->core_forceidle = true; >>>> >>>> +rq_i->core_pick->core_occupation = occ; >>>> + >>>> if (i == cpu) >>>> continue; >>>> >>>> @@ -4408,6 +4429,114 @@ next_class:; >>>> return next; >>>> } >>>> >>>> +static bool try_steal_cookie(int this, int that) >>>> +{ >>>> +struct rq *dst = cpu_rq(this), *src = cpu_rq(that); >>>> +struct task_struct *p; >>>> +unsigned long cookie; >>>> +bool success = false; >>>> + >>>> +local_irq_disable(); >>>> +double_rq_lock(dst, src); >>>> + >>>> +cookie = dst->core->core_cookie; >>>> +if (!cookie) >>>> +goto unlock; >>>> + >>>> +if (dst->curr != dst->idle) >>>> +goto unlock; >>>> + >>>> +p = sched_core_find(src, cookie); >>>> +if (p == src->idle) >>>> +goto unlock; >>>> + >>>> +do { >>>> +if (p == src->core_pick || p == src->curr) >>>> +goto next; >>>> + >>>> +if (!cpumask_test_cpu(this, &p->cpus_mask)) >>>> +goto next; >>>> + >>>> +if (p->core_occupation > dst->idle->core_occupation) >>>> +goto next; >>>> + >>>> +p->on_rq = TASK_ON_RQ_MIGRATING; >>>> +deactivate_task(src, p, 0); >>>> +set_task_cpu(p, this); >>>> +activate_task(dst, p, 0); >>>> +p->on_rq = TASK_ON_RQ_QUEUED; >>>> + >>>> +resched_curr(dst); >>>> + >>>> +success = true; >>>> +break; >>>> + >>>> +next: >>>> +p = sched_core_next(p, cookie); >>>> +} while (p); >>>> + >>>> +unlock: >>>> +double_rq_unlock(dst, src); >>>> +local_irq_enable(); >>>> + >>>> +return success; >>>> +} >>>> + >>>> +static bool steal_cookie_task(int cpu, struct sched_domain *sd) >>>> +{ >>>> +int i; >>>> + >>>> +for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { >>> Since (i == cpu) should be skipped, should we start iteration at cpu+1? >>> like, >>> for_each_cpu_wrap(i, sched_domain_span(sd), cpu+1) { >>> … >>> } >>> In that way, we could avoid hitting following if(i == cpu) always. >> >> IMHO, this won't work, as cpuid is not continuous. > Cpuid may be not continuous, but for_each_cpu_wrap() could cover the case, I > think. :)
And for_each_cpu_wrap() will still wrap around and pick i == cpu, even though it starts from (cpu+1)... Thanks, -Aubrey > >> >>>> +if (i == cpu) >>>> +continue; >>>> + >>>> +if (need_resched()) >>>> +break; >>> Should we return true here to accelerate the breaking of >>> sched_core_balance? >>> Otherwise the breaking would be delayed to the next level sd iteration. >>>> + >>>> +if (try_steal_cookie(cpu, i)) >>>> +return true; >>>> +} >>>> + >>>> +return false; >>>> +} >>>> + >>>> +static void sched_core_balance(struct rq *rq) >>>> +{ >>>> +struct sched_domain *sd; >>>> +int cpu = cpu_of(rq); >>>> + >>>> +rcu_read_lock_sched(); >>>> +raw_spin_unlock_irq(rq_lockp(rq)); >>>> +for_each_domain(cpu, sd) { >>>> +if (!(sd->flags & SD_LOAD_BALANCE)) >>>> +break; >>>> + >>>> +if (need_resched()) >>>> +break; >>> If rescheded here, we missed the chance to do further forced-newidle >>> balance, >>> and the idle-core could be idle for a long time, because lacking of pulling >>> chance. >>> Could it be possible to add a new forced-newidle balance chance in >>> task_tick_idle? >>> which could make it more efficient. >> >> This flag indicates there is another thread deserves to run, So I guess the >> core won't >> be idle for a long time. >> >> Thanks, >> -Aubrey > Indeed, thanks for the explanation. :) > >>> >>>> +if (steal_cookie_task(cpu, sd)) >>>> +break; >>>> +} >>>> +raw_spin_lock_irq(rq_lockp(rq)); >>>> +rcu_read_unlock_sched(); >>>> +} >>>> + >>>> +static DEFINE_PER_CPU(struct callback_head, core_balance_head); >>>> + >>>> +void queue_core_balance(struct rq *rq) >>>> +{ >>>> +if (!sched_core_enabled(rq)) >>>> +return; >>>> + >>>> +if (!rq->core->core_cookie) >>>> +return; >>>> + >>>> +if (!rq->nr_running) /* not forced idle */ >>>> +return; >>>> + >>>> +queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), >>>> sched_core_balance); >>>> +} >>>> + >>>> #else /* !CONFIG_SCHED_CORE */ >>>> >>>> static struct task_struct * >>>> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c >>>> index a8d40ffab097..dff6ba220ed7 100644 >>>> --- a/kernel/sched/idle.c >>>> +++ b/kernel/sched/idle.c >>>> @@ -395,6 +395,7 @@ static void set_next_task_idle(struct rq *rq, struct >>>> task_struct *next, bool fir >>>> { >>>> update_idle_core(rq); >>>> schedstat_inc(rq->sched_goidle); >>>> +queue_core_balance(rq); >>>> } >>>> >>>> #ifdef CONFIG_SMP >>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h >>>> index 293aa1ae0308..464559676fd2 100644 >>>> --- a/kernel/sched/sched.h >>>> +++ b/kernel/sched/sched.h >>>> @@ -1089,6 +1089,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) >>>> bool cfs_prio_less(struct task_struct *a, struct task_struct *b); >>>> void sched_core_adjust_sibling_vruntime(int cpu, bool coresched_enabled); >>>> >>>> +extern void queue_core_balance(struct rq *rq); >>>> + >>>> #else /* !CONFIG_SCHED_CORE */ >>>> >>>> static inline bool sched_core_enabled(struct rq *rq) >>>> @@ -1101,6 +1103,10 @@ static inline raw_spinlock_t *rq_lockp(struct rq >>>> *rq) >>>> return &rq->__lock; >>>> } >>>> >>>> +static inline void queue_core_balance(struct rq *rq) >>>> +{ >>>> +} >>>> + >>>> #endif /* CONFIG_SCHED_CORE */ >>>> >>>> #ifdef CONFIG_SCHED_SMT >>>> -- >>>> 2.17.1 >