[GIT PULL] scheduler fixes

Ingo Molnar Thu, 24 Mar 2016 00:52:30 -0700

Linus,

Please pull the latest sched-urgent-for-linus git tree from:


   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: 73e6aafd9ea81498d31361f01db84a0118da2d1c sched/cpuacct: Simplify the cpuacct code

Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a cputime
fix and two cpuacct cleanups.

 Thanks,

        Ingo

------------------>
Dongsheng Yang (1):
      sched/cpuacct: Rename parameter in cpuusage_write() for readability

Matt Fleming (1):
      sched/fair: Add comments to explain select_idle_sibling()

Peter Zijlstra (2):
      sched/cgroup: Fix/cleanup cgroup teardown/init
      sched/fair: Fix fairness issue on migration

Thomas Gleixner (1):
      sched/cputime: Fix steal time accounting vs. CPU hotplug

Zhao Lei (1):
      sched/cpuacct: Simplify the cpuacct code


 kernel/sched/core.c    | 36 +++++++++++++++---------------------
 kernel/sched/cpuacct.c | 35 ++++++++++-------------------------
 kernel/sched/cpuacct.h |  4 ++--
 kernel/sched/fair.c    | 39 ++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h   | 13 +++++++++++++
 5 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea8f49ae0062..2a87bdde8d4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5369,6 +5369,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
        case CPU_UP_PREPARE:
                rq->calc_load_update = calc_load_update;
+               account_reset_rq(rq);
                break;
 
        case CPU_ONLINE:
@@ -7535,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
@@ -7561,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent)
        return tg;
 
 err:
-       free_sched_group(tg);
+       sched_free_group(tg);
        return ERR_PTR(-ENOMEM);
 }
 
@@ -7581,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
 {
        /* now it should be safe to free those cfs_rqs */
-       free_sched_group(container_of(rhp, struct task_group, rcu));
+       sched_free_group(container_of(rhp, struct task_group, rcu));
 }
 
-/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
        /* wait for possible concurrent references to cfs_rqs complete */
-       call_rcu(&tg->rcu, free_sched_group_rcu);
+       call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -8050,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
 
+       sched_online_group(tg, parent);
+
        return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
        struct task_group *tg = css_tg(css);
-       struct task_group *parent = css_tg(css->parent);
 
-       if (parent)
-               sched_online_group(tg, parent);
-       return 0;
+       sched_offline_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
        struct task_group *tg = css_tg(css);
 
-       sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
-       struct task_group *tg = css_tg(css);
-
-       sched_offline_group(tg);
+       /*
+        * Relies on the RCU grace period between css_released() and this.
+        */
+       sched_free_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task)
@@ -8434,9 +8429,8 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
+       .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
-       .css_online     = cpu_cgroup_css_online,
-       .css_offline    = cpu_cgroup_css_offline,
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..434c2fa41352 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 }
 
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-                         u64 reset)
+                         u64 val)
 {
        struct cpuacct *ca = css_ca(css);
        int err = 0;
        int i;
 
-       if (reset) {
+       /*
+        * Only allow '0' here to do a reset.
+        */
+       if (val) {
                err = -EINVAL;
                goto out;
        }
@@ -235,23 +238,10 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
-       int cpu;
-
-       cpu = task_cpu(tsk);
 
        rcu_read_lock();
-
-       ca = task_ca(tsk);
-
-       while (true) {
-               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-               *cpuusage += cputime;
-
-               ca = parent_ca(ca);
-               if (!ca)
-                       break;
-       }
-
+       for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+               *this_cpu_ptr(ca->cpuusage) += cputime;
        rcu_read_unlock();
 }
 
@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  *
  * Note: it's the caller that updates the account of the root cgroup.
  */
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
-       struct kernel_cpustat *kcpustat;
        struct cpuacct *ca;
 
        rcu_read_lock();
-       ca = task_ca(p);
-       while (ca != &root_cpuacct) {
-               kcpustat = this_cpu_ptr(ca->cpustat);
-               kcpustat->cpustat[index] += val;
-               ca = parent_ca(ca);
-       }
+       for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+               this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
        rcu_read_unlock();
 }
 
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
 #ifdef CONFIG_CGROUP_CPUACCT
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
 
 #else
 
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 
 static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33130529e9b5..303d6392b389 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,17 +3157,25 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+       bool curr = cfs_rq->curr == se;
+
        /*
-        * Update the normalized vruntime before updating min_vruntime
-        * through calling update_curr().
+        * If we're the current task, we must renormalise before calling
+        * update_curr().
         */
-       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+       if (renorm && curr)
                se->vruntime += cfs_rq->min_vruntime;
 
+       update_curr(cfs_rq);
+
        /*
-        * Update run-time statistics of the 'current'.
+        * Otherwise, renormalise after, such that we're placed at the current
+        * moment in time, instead of some random moment in the past.
         */
-       update_curr(cfs_rq);
+       if (renorm && !curr)
+               se->vruntime += cfs_rq->min_vruntime;
+
        enqueue_entity_load_avg(cfs_rq, se);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);
@@ -3183,7 +3191,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                update_stats_enqueue(cfs_rq, se);
                check_spread(cfs_rq, se);
        }
-       if (se != cfs_rq->curr)
+       if (!curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
 
@@ -5047,7 +5055,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
                return i;
 
        /*
-        * Otherwise, iterate the domains and find an elegible idle cpu.
+        * Otherwise, iterate the domains and find an eligible idle cpu.
+        *
+        * A completely idle sched group at higher domains is more
+        * desirable than an idle group at a lower level, because lower
+        * domains have smaller groups and usually share hardware
+        * resources which causes tasks to contend on them, e.g. x86
+        * hyperthread siblings in the lowest domain (SMT) can contend
+        * on the shared cpu pipeline.
+        *
+        * However, while we prefer idle groups at higher domains
+        * finding an idle cpu at the lowest domain is still better than
+        * returning 'target', which we've already established, isn't
+        * idle.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
        for_each_lower_domain(sd) {
@@ -5057,11 +5077,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
                                                tsk_cpus_allowed(p)))
                                goto next;
 
+                       /* Ensure the entire group is idle */
                        for_each_cpu(i, sched_group_cpus(sg)) {
                                if (i == target || !idle_cpu(i))
                                        goto next;
                        }
 
+                       /*
+                        * It doesn't matter which cpu we pick, the
+                        * whole group is idle.
+                        */
                        target = cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
                        goto done;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2ff5a2bd6df..e6d4a3fa3660 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1793,3 +1793,16 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+       rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       rq->prev_steal_time_rq = 0;
+#endif
+}

[GIT PULL] scheduler fixes

Reply via email to