Author: Vladimir Davydov
Email: [email protected]
Subject: sched: make nr_cpus limit support hierarchies
Date: Wed, 26 Nov 2014 17:29:31 +0300

Before this patch, the nr_cpus limiter was completely unaware of
hierarchies: creating a cpu sub-cgroup inside a container would result
in the sub-cgroup's tasks being spread over all physical cpus,
irrespective of the container's nr_cpus limit.

This patch changes that: it makes sub-cgroups created inside a
container's cpu cgroup respect the container's nr_cpus limit. For
example, if a container has nr_cpus=2, all of its tasks should run on
2 physical cpus most of the time, even if some of them have been moved
into a cpu sub-cgroup.
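
To illustrate how the container-level group is found: the patch adds a
top_cfs_rq_of() helper that walks up the sched_entity parent chain. The
following is a standalone userspace model of that walk (the struct
layout and queue names are mock-ups for illustration only, not kernel
code):

    #include <stdio.h>

    /* Toy sched_entity: "queue" names the cfs_rq this entity is
     * enqueued on, i.e. what cfs_rq_of() would return for it. */
    struct se { struct se *parent; const char *queue; };

    /* Mirrors the patch's top_cfs_rq_of(): walk up while at least
     * two parent levels remain, so we stop at the entity enqueued
     * on the container's top-level cfs_rq. */
    static const char *top_cfs_rq_of(struct se *se)
    {
            while (se->parent && se->parent->parent)
                    se = se->parent;
            return se->queue;
    }

    int main(void)
    {
            struct se ct  = { NULL, "root cfs_rq"    }; /* container ct0 */
            struct se sub = { &ct,  "ct0 cfs_rq"     }; /* ct0/sub group */
            struct se t1  = { &ct,  "ct0 cfs_rq"     }; /* task in ct0   */
            struct se t2  = { &sub, "ct0/sub cfs_rq" }; /* task in sub   */

            /* Both tasks resolve to the container-level runqueue: */
            printf("%s\n", top_cfs_rq_of(&t1)); /* ct0 cfs_rq */
            printf("%s\n", top_cfs_rq_of(&t2)); /* ct0 cfs_rq */
            return 0;
    }

So nr_cpus checks made against top_cfs_rq_of(&p->se) always see the
container's limit, no matter how deeply the task's cgroup is nested.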

However, nr_cpus set on a cpu sub-cgroup of a container still does not
limit parallelism, only the total cpu time granted to the sub-cgroup,
because a fully hierarchical implementation of the nr_cpus limit would
likely hurt performance significantly.
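
In other words (a toy illustration with hypothetical names, not the
actual cpulimit data structures):

    #include <stdio.h>

    /* Toy task group carrying an nr_cpus limit (hypothetical
     * stand-in for the cpulimit fields used by the patch). */
    struct tg { const char *name; int nr_cpus; };

    int main(void)
    {
            struct tg ct  = { "ct0",     2 }; /* container        */
            struct tg sub = { "ct0/sub", 1 }; /* nested sub-group */

            /* Spread checks only consult the container: */
            printf("%s tasks span at most %d cpus\n", ct.name, ct.nr_cpus);

            /* The sub-group's nr_cpus does not confine placement;
             * it only scales the group's cpu-time quota, so a task
             * in ct0/sub may still run on either of ct0's 2 cpus: */
            printf("%s granted %d cpu worth of time\n", sub.name, sub.nr_cpus);
            return 0;
    }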

Signed-off-by: Vladimir Davydov <[email protected]>
=============================================================================

Related to https://jira.sw.ru/browse/PSBM-33642

Signed-off-by: Vladimir Davydov <[email protected]>
---
 kernel/sched/fair.c  | 157 ++++++++++++++++++++++++++++++++++++---------------
 kernel/sched/sched.h |   1 +
 2 files changed, 111 insertions(+), 47 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecac940ddebd..25df08082a6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -263,6 +263,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return grp->my_q;
 }
 
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *se;
+
+       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+       return se && !se->parent;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+       while (se->parent && se->parent->parent)
+               se = se->parent;
+       return cfs_rq_of(se);
+}
+
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
                                       int force_update);
 
@@ -391,6 +406,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return NULL;
 }
 
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       return false;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+       return cfs_rq_of(se);
+}
+
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
@@ -3010,6 +3035,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+       if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+               inc_nr_active_cfs_rqs(cfs_rq);
+
        /*
         * Update the normalized vruntime before updating min_vruntime
         * through callig update_curr().
@@ -3130,6 +3158,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        update_min_vruntime(cfs_rq);
        update_cfs_shares(cfs_rq);
+
+       if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+               dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -4111,10 +4142,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        struct sched_entity *se = &p->se;
        int boost = check_enqueue_boost(rq, p, flags);
 
-       cfs_rq = task_cfs_rq(p);
-       if (list_empty(&cfs_rq->tasks))
-               inc_nr_active_cfs_rqs(cfs_rq);
-
        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
@@ -4182,6 +4209,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        int boosted = entity_boosted(se);
        int task_sleep = flags & DEQUEUE_SLEEP;
 
+       if (task_sleep)
+               flags |= DEQUEUE_TASK_SLEEP;
+
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, flags);
@@ -4234,10 +4264,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_rq_runnable_avg(rq, 1);
        }
        hrtick_update(rq);
-
-       cfs_rq = task_cfs_rq(p);
-       if (list_empty(&cfs_rq->tasks))
-               dec_nr_active_cfs_rqs(cfs_rq, task_sleep);
 }
 
 #ifdef CONFIG_SMP
@@ -4685,31 +4711,37 @@ done:
        return target;
 }
 
-static inline int cpu_is_runnable(struct task_struct *p, int cpu)
-{
-       return cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu]);
-}
-
-static int select_runnable_cpu(struct task_struct *p, int new_cpu)
+static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
 {
+       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
        struct sched_domain *sd;
        int prev_cpu = task_cpu(p);
        int cpu;
 
-       if (cpu_is_runnable(p, new_cpu))
-               return new_cpu;
+       cfs_rq = top_cfs_rq_of(&p->se);
+       if (check_cpulimit_spread(cfs_rq, *new_cpu) > 0)
+               return false;
 
-       if (cpu_is_runnable(p, prev_cpu))
-               return prev_cpu;
+       tg = cfs_rq->tg;
+
+       if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+               return true;
+
+       if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+               *new_cpu = prev_cpu;
+               return true;
+       }
 
-       for_each_domain(new_cpu, sd) {
+       for_each_domain(*new_cpu, sd) {
                for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
-                       if (cpu_is_runnable(p, cpu))
-                               return cpu;
+                       if (cfs_rq_active(tg->cfs_rq[cpu])) {
+                               *new_cpu = cpu;
+                               return true;
+                       }
                }
        }
-
-       return new_cpu;
+       return false;
 }
 
 /*
@@ -4767,10 +4799,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        new_cpu = prev_cpu;
        }
 
-       if (check_cpulimit_spread(task_cfs_rq(p), new_cpu) <= 0) {
-               new_cpu = select_runnable_cpu(p, new_cpu);
+       if (select_runnable_cpu(p, &new_cpu))
                goto unlock;
-       }
 
        if (affine_sd) {
                new_cpu = select_idle_sibling(p, new_cpu);
@@ -5047,21 +5077,33 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 #if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
 static int cpulimit_balance_cpu_stop(void *data);
 
-static void trigger_cpulimit_balance(struct rq *this_rq, struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
 {
-       int this_cpu = cpu_of(this_rq);
-       int cpu, target_cpu = -1;
+       struct rq *this_rq;
+       struct cfs_rq *cfs_rq;
+       int this_cpu, cpu, target_cpu = -1;
        struct sched_domain *sd;
 
+       if (!p->se.on_rq)
+               return;
+
+       this_rq = rq_of(cfs_rq_of(&p->se));
+       this_cpu = cpu_of(this_rq);
+
+       cfs_rq = top_cfs_rq_of(&p->se);
+       if (check_cpulimit_spread(cfs_rq, this_cpu) >= 0)
+               return;
+
        raw_spin_unlock(&this_rq->lock);
 
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
-               for_each_cpu_and(cpu, sched_domain_span(sd), tsk_cpus_allowed(p)) {
+               for_each_cpu_and(cpu, sched_domain_span(sd),
+                                tsk_cpus_allowed(p)) {
                        if (cpu != this_cpu &&
-                           cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+                           cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
                                target_cpu = cpu;
                                goto unlock;
                        }
@@ -5084,8 +5126,7 @@ unlock:
        }
 }
 #else
-static inline void trigger_cpulimit_balance(struct rq *this_rq,
-                                           struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
 {
 }
 #endif
@@ -5103,9 +5144,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
                put_prev_entity(cfs_rq, se);
        }
 
-       if (prev->se.on_rq &&
-           check_cpulimit_spread(task_cfs_rq(prev), cpu_of(rq)) < 0)
-               trigger_cpulimit_balance(rq, prev);
+       trigger_cpulimit_balance(prev);
 }
 
 /*
@@ -5435,27 +5474,27 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
+       struct cfs_rq *cfs_rq = top_cfs_rq_of(&p->se);
        int tsk_cache_hot = 0;
 
-       if (check_cpulimit_spread(task_cfs_rq(p), env->dst_cpu) < 0) {
+       if (check_cpulimit_spread(cfs_rq, env->dst_cpu) < 0) {
                int cpu;
 
                schedstat_inc(p, se.statistics.nr_failed_migrations_cpulimit);
 
-               if (check_cpulimit_spread(task_cfs_rq(p), env->src_cpu) != 0)
+               if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0)
                        return 0;
 
                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
                        return 0;
 
                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
-                       if (cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+                       if (cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
                                env->flags |= LBF_SOME_PINNED;
                                env->new_dst_cpu = cpu;
                                break;
                        }
                }
-
                return 0;
        }
 
@@ -5683,7 +5722,8 @@ static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
 
 static int move_task_groups(struct lb_env *env)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *top_cfs_rq;
+       struct task_group *tg;
        unsigned long load;
        int cur_pulled, pulled = 0;
 
@@ -5691,11 +5731,25 @@ static int move_task_groups(struct lb_env *env)
                return 0;
 
        for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-               if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0 ||
-                   cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]))
+               tg = cfs_rq->tg;
+               if (tg == &root_task_group)
+                       continue;
+               /*
+                * A child always goes before its parent in a leaf_cfs_rq_list.
+                * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+                * we could not migrate the child and therefore we should not
+                * even try to migrate the parent.
+                */
+               if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+                       continue;
+
+               top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+                               top_cfs_rq_of(tg->se[env->src_cpu]);
+               if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) != 0 ||
+                   cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]))
                        continue;
 
-               load = entity_h_load(cfs_rq->tg->se[env->src_cpu]);
+               load = entity_h_load(top_cfs_rq->tg->se[env->src_cpu]);
                if ((load / 2) > env->imbalance)
                        continue;
 
@@ -5718,12 +5772,21 @@ static int move_task_groups(struct lb_env *env)
 
 static int do_cpulimit_balance(struct lb_env *env)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *top_cfs_rq;
+       struct task_group *tg;
        int pushed = 0;
 
        for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-               if (check_cpulimit_spread(cfs_rq, env->src_cpu) < 0 &&
-                   cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
+               tg = cfs_rq->tg;
+               if (tg == &root_task_group)
+                       continue;
+               /* see move_task_groups for why we skip such groups */
+               if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+                       continue;
+               top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+                               top_cfs_rq_of(tg->se[env->src_cpu]);
+               if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) < 0 &&
+                   cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
                    can_migrate_task_group(cfs_rq, env))
                        pushed += move_task_group(cfs_rq, env);
        }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4f513bc668b..8fac301ff828 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1043,6 +1043,7 @@ static const u32 prio_to_wmult[40] = {
 #define ENQUEUE_BOOST          8
 
 #define DEQUEUE_SLEEP          1
+#define DEQUEUE_TASK_SLEEP     2
 
 struct sched_class {
        const struct sched_class *next;
-- 
2.1.4
