Author: Vladimir Davydov
Email: [email protected]
Subject: sched: make nr_cpus limit support hierarchies
Date: Wed, 26 Nov 2014 17:29:31 +0300
Before this patch, the nr_cpus limiter was completely unaware of hierarchies,
i.e. creation of a cpu sub-cgroup in a container would result in its tasks
being spread over all physical cpus irrespective of the container's nr_cpus
limit. This patch changes that: sub-cgroups created inside a container's cpu
cgroup now respect the container's nr_cpus limit. For example, if a container
has nr_cpus=2, all its tasks should be running on 2 physical cpus most of the
time, even if some of the tasks were moved into a cpu sub-cgroup.

However, a nr_cpus value set inside a cpu sub-cgroup of a container still does
not limit parallelism, only the total cpu time granted to the sub-cgroup,
because a fully hierarchical nr_cpus limit would be likely to hurt performance
significantly.

Signed-off-by: Vladimir Davydov <[email protected]>
=============================================================================

Related to https://jira.sw.ru/browse/PSBM-33642

Signed-off-by: Vladimir Davydov <[email protected]>
---
 kernel/sched/fair.c  | 157 ++++++++++++++++++++++++++++++++++++---------------
 kernel/sched/sched.h |   1 +
 2 files changed, 111 insertions(+), 47 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecac940ddebd..25df08082a6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -263,6 +263,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
         return grp->my_q;
 }
 
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+        struct sched_entity *se;
+
+        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+        return se && !se->parent;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+        while (se->parent && se->parent->parent)
+                se = se->parent;
+        return cfs_rq_of(se);
+}
+
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
                                        int force_update);
 
@@ -391,6 +406,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
         return NULL;
 }
 
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+        return false;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+        return cfs_rq_of(se);
+}
+
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
@@ -3010,6 +3035,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+        if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+                inc_nr_active_cfs_rqs(cfs_rq);
+
         /*
          * Update the normalized vruntime before updating min_vruntime
          * through callig update_curr().
@@ -3130,6 +3158,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         update_min_vruntime(cfs_rq);
         update_cfs_shares(cfs_rq);
+
+        if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+                dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -4111,10 +4142,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         struct sched_entity *se = &p->se;
         int boost = check_enqueue_boost(rq, p, flags);
 
-        cfs_rq = task_cfs_rq(p);
-        if (list_empty(&cfs_rq->tasks))
-                inc_nr_active_cfs_rqs(cfs_rq);
-
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -4182,6 +4209,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         int boosted = entity_boosted(se);
         int task_sleep = flags & DEQUEUE_SLEEP;
 
+        if (task_sleep)
+                flags |= DEQUEUE_TASK_SLEEP;
+
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, flags);
@@ -4234,10 +4264,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 update_rq_runnable_avg(rq, 1);
         }
         hrtick_update(rq);
-
-        cfs_rq = task_cfs_rq(p);
-        if (list_empty(&cfs_rq->tasks))
-                dec_nr_active_cfs_rqs(cfs_rq, task_sleep);
 }
 
 #ifdef CONFIG_SMP
@@ -4685,31 +4711,37 @@ done:
         return target;
 }
 
-static inline int cpu_is_runnable(struct task_struct *p, int cpu)
-{
-        return cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu]);
-}
-
-static int select_runnable_cpu(struct task_struct *p, int new_cpu)
+static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
 {
+        struct cfs_rq *cfs_rq;
+        struct task_group *tg;
         struct sched_domain *sd;
         int prev_cpu = task_cpu(p);
         int cpu;
 
-        if (cpu_is_runnable(p, new_cpu))
-                return new_cpu;
+        cfs_rq = top_cfs_rq_of(&p->se);
+        if (check_cpulimit_spread(cfs_rq, *new_cpu) > 0)
+                return false;
 
-        if (cpu_is_runnable(p, prev_cpu))
-                return prev_cpu;
+        tg = cfs_rq->tg;
+
+        if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+                return true;
+
+        if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+                *new_cpu = prev_cpu;
+                return true;
+        }
 
-        for_each_domain(new_cpu, sd) {
+        for_each_domain(*new_cpu, sd) {
                 for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
-                        if (cpu_is_runnable(p, cpu))
-                                return cpu;
+                        if (cfs_rq_active(tg->cfs_rq[cpu])) {
+                                *new_cpu = cpu;
+                                return true;
+                        }
                 }
         }
-
-        return new_cpu;
+        return false;
 }
 
 /*
@@ -4767,10 +4799,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                 new_cpu = prev_cpu;
         }
 
-        if (check_cpulimit_spread(task_cfs_rq(p), new_cpu) <= 0) {
-                new_cpu = select_runnable_cpu(p, new_cpu);
+        if (select_runnable_cpu(p, &new_cpu))
                 goto unlock;
-        }
 
         if (affine_sd) {
                 new_cpu = select_idle_sibling(p, new_cpu);
@@ -5047,21 +5077,33 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 #if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
 static int cpulimit_balance_cpu_stop(void *data);
 
-static void trigger_cpulimit_balance(struct rq *this_rq, struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
 {
-        int this_cpu = cpu_of(this_rq);
-        int cpu, target_cpu = -1;
+        struct rq *this_rq;
+        struct cfs_rq *cfs_rq;
+        int this_cpu, cpu, target_cpu = -1;
         struct sched_domain *sd;
 
+        if (!p->se.on_rq)
+                return;
+
+        this_rq = rq_of(cfs_rq_of(&p->se));
+        this_cpu = cpu_of(this_rq);
+
+        cfs_rq = top_cfs_rq_of(&p->se);
+        if (check_cpulimit_spread(cfs_rq, this_cpu) >= 0)
+                return;
+
         raw_spin_unlock(&this_rq->lock);
 
         rcu_read_lock();
         for_each_domain(this_cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;
-                for_each_cpu_and(cpu, sched_domain_span(sd), tsk_cpus_allowed(p)) {
+                for_each_cpu_and(cpu, sched_domain_span(sd),
+                                 tsk_cpus_allowed(p)) {
                         if (cpu != this_cpu &&
-                            cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+                            cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
                                 target_cpu = cpu;
                                 goto unlock;
                         }
@@ -5084,8 +5126,7 @@ unlock:
         }
 }
 #else
-static inline void trigger_cpulimit_balance(struct rq *this_rq,
-                                            struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
 {
 }
 #endif
@@ -5103,9 +5144,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
                 put_prev_entity(cfs_rq, se);
         }
 
-        if (prev->se.on_rq &&
-            check_cpulimit_spread(task_cfs_rq(prev), cpu_of(rq)) < 0)
-                trigger_cpulimit_balance(rq, prev);
+        trigger_cpulimit_balance(prev);
 }
 
 /*
@@ -5435,27 +5474,27 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
+        struct cfs_rq *cfs_rq = top_cfs_rq_of(&p->se);
         int tsk_cache_hot = 0;
 
-        if (check_cpulimit_spread(task_cfs_rq(p), env->dst_cpu) < 0) {
+        if (check_cpulimit_spread(cfs_rq, env->dst_cpu) < 0) {
                 int cpu;
 
                 schedstat_inc(p, se.statistics.nr_failed_migrations_cpulimit);
 
-                if (check_cpulimit_spread(task_cfs_rq(p), env->src_cpu) != 0)
+                if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0)
                         return 0;
 
                 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
                         return 0;
 
                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
-                        if (cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+                        if (cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
                                 env->flags |= LBF_SOME_PINNED;
                                 env->new_dst_cpu = cpu;
                                 break;
                         }
                 }
-
                 return 0;
         }
@@ -5683,7 +5722,8 @@ static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
 
 static int move_task_groups(struct lb_env *env)
 {
-        struct cfs_rq *cfs_rq;
+        struct cfs_rq *cfs_rq, *top_cfs_rq;
+        struct task_group *tg;
         unsigned long load;
         int cur_pulled, pulled = 0;
 
@@ -5691,11 +5731,25 @@ static int move_task_groups(struct lb_env *env)
                 return 0;
 
         for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-                if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0 ||
-                    cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]))
+                tg = cfs_rq->tg;
+                if (tg == &root_task_group)
+                        continue;
+                /*
+                 * A child always goes before its parent in a leaf_cfs_rq_list.
+                 * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+                 * we could not migrate the child and therefore we should not
+                 * even try to migrate the parent.
+                 */
+                if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+                        continue;
+
+                top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+                             top_cfs_rq_of(tg->se[env->src_cpu]);
+                if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) != 0 ||
+                    cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]))
                         continue;
 
-                load = entity_h_load(cfs_rq->tg->se[env->src_cpu]);
+                load = entity_h_load(top_cfs_rq->tg->se[env->src_cpu]);
                 if ((load / 2) > env->imbalance)
                         continue;
 
@@ -5718,12 +5772,21 @@ static int move_task_groups(struct lb_env *env)
 
 static int do_cpulimit_balance(struct lb_env *env)
 {
-        struct cfs_rq *cfs_rq;
+        struct cfs_rq *cfs_rq, *top_cfs_rq;
+        struct task_group *tg;
         int pushed = 0;
 
         for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-                if (check_cpulimit_spread(cfs_rq, env->src_cpu) < 0 &&
-                    cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
+                tg = cfs_rq->tg;
+                if (tg == &root_task_group)
+                        continue;
+                /* see move_task_groups for why we skip such groups */
+                if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+                        continue;
+                top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+                             top_cfs_rq_of(tg->se[env->src_cpu]);
+                if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) < 0 &&
+                    cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
                     can_migrate_task_group(cfs_rq, env))
                         pushed += move_task_group(cfs_rq, env);
         }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4f513bc668b..8fac301ff828 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1043,6 +1043,7 @@ static const u32 prio_to_wmult[40] = {
 #define ENQUEUE_BOOST        8
 
 #define DEQUEUE_SLEEP        1
+#define DEQUEUE_TASK_SLEEP   2
 
 struct sched_class {
         const struct sched_class *next;
-- 
2.1.4
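The whole patch revolves around one idea: every nr_cpus decision is made
against the top-level (container) cpu cgroup, which top_cfs_rq_of() finds by
walking up the sched_entity parent chain, and per-cpu activity is tracked only
for that top-level group (inc/dec_nr_active_cfs_rqs() moves from
enqueue/dequeue_task_fair() into enqueue/dequeue_entity(), guarded by
is_top_cfs_rq()). The stand-alone user-space sketch below models just that
walk-up and check; struct grp, top_group_of() and may_use_cpu() are invented
names for illustration only, not kernel interfaces; the real code operates on
sched_entity/cfs_rq and check_cpulimit_spread() instead.

/*
 * Stand-alone user-space model (illustration only, not kernel code).
 * Build with: cc -std=c99 -Wall sketch.c
 */
#include <stdbool.h>
#include <stdio.h>

struct grp {
        struct grp *parent;     /* NULL for the root group */
        unsigned int nr_cpus;   /* limit; 0 means unlimited */
        unsigned int nr_active; /* cpus the group currently runs on */
};

/* Walk up to the group right below the root, i.e. the "container" level. */
static struct grp *top_group_of(struct grp *g)
{
        while (g->parent && g->parent->parent)
                g = g->parent;
        return g;
}

/* May a task of group @g start running on one more cpu? */
static bool may_use_cpu(struct grp *g)
{
        struct grp *top = top_group_of(g);

        return !top->nr_cpus || top->nr_active < top->nr_cpus;
}

int main(void)
{
        struct grp root = { .parent = NULL };
        struct grp ct   = { .parent = &root, .nr_cpus = 2, .nr_active = 2 };
        struct grp sub  = { .parent = &ct };

        /* The sub-group inherits the container limit: already on 2 cpus. */
        printf("sub-group may spread to another cpu: %s\n",
               may_use_cpu(&sub) ? "yes" : "no");
        return 0;
}

With nr_cpus=2 and the container already active on 2 cpus, the sub-group is
answered "no", which matches the behaviour described above for tasks moved
into a cpu sub-cgroup.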
