Linus,

please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

This scheduler update provides:

   - The (hopefully) final fix for the vtime accounting issues that have
     been around for quite some time

   - Use types known to user space in UAPI headers to unbreak user space
     builds

   - Make load balancing respect the current scheduling domain again
     instead of evaluating unrelated CPUs

Thanks,

        tglx

------------------>
Dmitry V. Levin (1):
      sched/headers/uapi: Fix linux/sched/types.h userspace compilation errors

Frederic Weisbecker (4):
      vtime, sched/cputime: Remove vtime_account_user()
      sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime
      sched/cputime: Rename vtime fields
      sched/cputime: Move the vtime task fields to their own struct

Ingo Molnar (1):
      Revert "sched/cputime: Refactor the cputime_adjust() code"

Jeffrey Hugo (1):
      sched/fair: Fix load_balance() affinity redo path

Wanpeng Li (1):
      sched/cputime: Accumulate vtime on top of nsec clocksource


 include/linux/init_task.h        |   6 +-
 include/linux/sched.h            |  29 ++++---
 include/linux/vtime.h            |   9 +-
 include/uapi/linux/sched/types.h |  16 ++--
 kernel/fork.c                    |   6 +-
 kernel/sched/cputime.c           | 180 ++++++++++++++++++++++++---------------
 kernel/sched/fair.c              |  32 ++++---
 7 files changed, 165 insertions(+), 113 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc188..a2f6707e9fc0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,9 +170,9 @@ extern struct cred init_cred;
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)                                               \
-       .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
-       .vtime_snap = 0,                                \
-       .vtime_snap_whence = VTIME_SYS,
+       .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),              \
+       .vtime.starttime = 0,                                           \
+       .vtime.state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c4ca7433d9d..4818126c5153 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,6 +223,24 @@ struct task_cputime {
 #define prof_exp                       stime
 #define sched_exp                      sum_exec_runtime
 
+enum vtime_state {
+       /* Task is sleeping or running in a CPU with VTIME inactive: */
+       VTIME_INACTIVE = 0,
+       /* Task runs in userspace in a CPU with VTIME active: */
+       VTIME_USER,
+       /* Task runs in kernelspace in a CPU with VTIME active: */
+       VTIME_SYS,
+};
+
+struct vtime {
+       seqcount_t              seqcount;
+       unsigned long long      starttime;
+       enum vtime_state        state;
+       u64                     utime;
+       u64                     stime;
+       u64                     gtime;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */
@@ -688,16 +706,7 @@ struct task_struct {
        u64                             gtime;
        struct prev_cputime             prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-       seqcount_t                      vtime_seqcount;
-       unsigned long long              vtime_snap;
-       enum {
-               /* Task is sleeping or running in a CPU with VTIME inactive: */
-               VTIME_INACTIVE = 0,
-               /* Task runs in userspace in a CPU with VTIME active: */
-               VTIME_USER,
-               /* Task runs in kernelspace in a CPU with VTIME active: */
-               VTIME_SYS,
-       } vtime_snap_whence;
+       struct vtime                    vtime;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
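
The vtime fields are now grouped behind a single seqcount. As a standalone
illustration of the reader side of that pattern (hypothetical code, not from
this patch): a reader retries until it sees an even, unchanged sequence
number, which guarantees no writer was active during the read.

/*
 * Hypothetical sketch of a seqcount reader; stand-ins for the kernel's
 * seqcount_t and read_seqcount_begin()/read_seqcount_retry() helpers.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct vtime_sketch {
	atomic_uint seq;	/* stand-in for seqcount_t */
	uint64_t utime;
	uint64_t stime;
};

static uint64_t read_utime(struct vtime_sketch *vt)
{
	unsigned int seq;
	uint64_t val;

	for (;;) {
		seq = atomic_load(&vt->seq);
		if (seq & 1)
			continue;	/* odd: writer in progress, retry */
		val = vt->utime;
		if (atomic_load(&vt->seq) == seq)
			return val;	/* unchanged: snapshot is consistent */
	}
}

int main(void)
{
	struct vtime_sketch vt = { .utime = 42 };

	printf("utime: %llu\n", (unsigned long long)read_utime(&vt));
	return 0;
}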
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 0681fe25abeb..18b405e3cd93 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
-
-static inline void vtime_user_exit(struct task_struct *tsk)
-{
-       vtime_account_user(tsk);
-}
-
+extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 307acbc82d80..34b81aa1a2f7 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -54,21 +54,21 @@ struct sched_param {
  * available in the scheduling class file or in Documentation/.
  */
 struct sched_attr {
-       u32 size;
+       __u32 size;
 
-       u32 sched_policy;
-       u64 sched_flags;
+       __u32 sched_policy;
+       __u64 sched_flags;
 
        /* SCHED_NORMAL, SCHED_BATCH */
-       s32 sched_nice;
+       __s32 sched_nice;
 
        /* SCHED_FIFO, SCHED_RR */
-       u32 sched_priority;
+       __u32 sched_priority;
 
        /* SCHED_DEADLINE */
-       u64 sched_runtime;
-       u64 sched_deadline;
-       u64 sched_period;
+       __u64 sched_runtime;
+       __u64 sched_deadline;
+       __u64 sched_period;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
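
To illustrate what this unbreaks, a minimal userspace sketch (a hypothetical
test program, not part of the patch) that includes the UAPI header directly:
with the kernel-internal u32/u64 typedefs the header did not compile in user
space, while __u32/__u64 from <linux/types.h> are always visible there. The
sketch assumes <sys/syscall.h> provides SYS_sched_setattr.

/* Hypothetical userspace test program, not part of this patch. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched/types.h>	/* struct sched_attr, now using __u32/__u64 */

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;	/* SCHED_OTHER */
	attr.sched_nice = 10;

	/* sched_setattr() has no glibc wrapper; invoke the syscall directly. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}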
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..d927ec11aa7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
        prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-       seqcount_init(&p->vtime_seqcount);
-       p->vtime_snap = 0;
-       p->vtime_snap_whence = VTIME_INACTIVE;
+       seqcount_init(&p->vtime.seqcount);
+       p->vtime.starttime = 0;
+       p->vtime.state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..6e3ea4ac1bda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
        utime = curr->utime;
 
        /*
-        * If either stime or both stime and utime are 0, assume all runtime is
-        * userspace. Once a task gets some ticks, the monotonicity code at
-        * 'update' will ensure things converge to the observed ratio.
+        * If either stime or utime are 0, assume all runtime is userspace.
+        * Once a task gets some ticks, the monotonicity code at 'update:'
+        * will ensure things converge to the observed ratio.
         */
-       if (stime != 0) {
-               if (utime == 0)
-                       stime = rtime;
-               else
-                       stime = scale_stime(stime, rtime, stime + utime);
+       if (stime == 0) {
+               utime = rtime;
+               goto update;
        }
 
+       if (utime == 0) {
+               stime = rtime;
+               goto update;
+       }
+
+       stime = scale_stime(stime, rtime, stime + utime);
+
+update:
        /*
         * Make sure stime doesn't go backwards; this preserves monotonicity
         * for utime because rtime is monotonic.
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
 {
-       unsigned long now = READ_ONCE(jiffies);
+       unsigned long long clock;
 
-       if (time_before(now, (unsigned long)tsk->vtime_snap))
+       clock = sched_clock_cpu(smp_processor_id());
+       if (clock < vtime->starttime)
                return 0;
 
-       return jiffies_to_nsecs(now - tsk->vtime_snap);
+       return clock - vtime->starttime;
 }
 
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
 {
-       unsigned long now = READ_ONCE(jiffies);
-       u64 delta, other;
+       u64 delta = vtime_delta(vtime);
+       u64 other;
 
        /*
         * Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
         * elapsed time. Limit account_other_time to prevent rounding
         * errors from causing elapsed vtime to go negative.
         */
-       delta = jiffies_to_nsecs(now - tsk->vtime_snap);
        other = account_other_time(delta);
-       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-       tsk->vtime_snap = now;
+       WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+       vtime->starttime += delta;
 
        return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+                                  struct vtime *vtime)
+{
+       vtime->stime += get_vtime_delta(vtime);
+       if (vtime->stime >= TICK_NSEC) {
+               account_system_time(tsk, irq_count(), vtime->stime);
+               vtime->stime = 0;
+       }
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+                               struct vtime *vtime)
 {
-       account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+       vtime->gtime += get_vtime_delta(vtime);
+       if (vtime->gtime >= TICK_NSEC) {
+               account_guest_time(tsk, vtime->gtime);
+               vtime->gtime = 0;
+       }
 }
 
 void vtime_account_system(struct task_struct *tsk)
 {
-       if (!vtime_delta(tsk))
+       struct vtime *vtime = &tsk->vtime;
+
+       if (!vtime_delta(vtime))
                return;
 
-       write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
-       write_seqcount_end(&tsk->vtime_seqcount);
+       write_seqcount_begin(&vtime->seqcount);
+       /* We might have scheduled out from guest path */
+       if (current->flags & PF_VCPU)
+               vtime_account_guest(tsk, vtime);
+       else
+               __vtime_account_system(tsk, vtime);
+       write_seqcount_end(&vtime->seqcount);
 }
 
-void vtime_account_user(struct task_struct *tsk)
+void vtime_user_enter(struct task_struct *tsk)
 {
-       write_seqcount_begin(&tsk->vtime_seqcount);
-       tsk->vtime_snap_whence = VTIME_SYS;
-       if (vtime_delta(tsk))
-               account_user_time(tsk, get_vtime_delta(tsk));
-       write_seqcount_end(&tsk->vtime_seqcount);
+       struct vtime *vtime = &tsk->vtime;
+
+       write_seqcount_begin(&vtime->seqcount);
+       __vtime_account_system(tsk, vtime);
+       vtime->state = VTIME_USER;
+       write_seqcount_end(&vtime->seqcount);
 }
 
-void vtime_user_enter(struct task_struct *tsk)
+void vtime_user_exit(struct task_struct *tsk)
 {
-       write_seqcount_begin(&tsk->vtime_seqcount);
-       if (vtime_delta(tsk))
-               __vtime_account_system(tsk);
-       tsk->vtime_snap_whence = VTIME_USER;
-       write_seqcount_end(&tsk->vtime_seqcount);
+       struct vtime *vtime = &tsk->vtime;
+
+       write_seqcount_begin(&vtime->seqcount);
+       vtime->utime += get_vtime_delta(vtime);
+       if (vtime->utime >= TICK_NSEC) {
+               account_user_time(tsk, vtime->utime);
+               vtime->utime = 0;
+       }
+       vtime->state = VTIME_SYS;
+       write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+       struct vtime *vtime = &tsk->vtime;
        /*
         * The flags must be updated under the lock with
-        * the vtime_snap flush and update.
+        * the vtime_starttime flush and update.
         * That enforces a right ordering and update sequence
         * synchronization against the reader (task_gtime())
         * that can thus safely catch up with a tickless delta.
         */
-       write_seqcount_begin(&tsk->vtime_seqcount);
-       if (vtime_delta(tsk))
-               __vtime_account_system(tsk);
+       write_seqcount_begin(&vtime->seqcount);
+       __vtime_account_system(tsk, vtime);
        current->flags |= PF_VCPU;
-       write_seqcount_end(&tsk->vtime_seqcount);
+       write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-       write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       struct vtime *vtime = &tsk->vtime;
+
+       write_seqcount_begin(&vtime->seqcount);
+       vtime_account_guest(tsk, vtime);
        current->flags &= ~PF_VCPU;
-       write_seqcount_end(&tsk->vtime_seqcount);
+       write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-       account_idle_time(get_vtime_delta(tsk));
+       account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-       write_seqcount_begin(&prev->vtime_seqcount);
-       prev->vtime_snap_whence = VTIME_INACTIVE;
-       write_seqcount_end(&prev->vtime_seqcount);
+       struct vtime *vtime = &prev->vtime;
+
+       write_seqcount_begin(&vtime->seqcount);
+       vtime->state = VTIME_INACTIVE;
+       write_seqcount_end(&vtime->seqcount);
+
+       vtime = &current->vtime;
 
-       write_seqcount_begin(&current->vtime_seqcount);
-       current->vtime_snap_whence = VTIME_SYS;
-       current->vtime_snap = jiffies;
-       write_seqcount_end(&current->vtime_seqcount);
+       write_seqcount_begin(&vtime->seqcount);
+       vtime->state = VTIME_SYS;
+       vtime->starttime = sched_clock_cpu(smp_processor_id());
+       write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
+       struct vtime *vtime = &t->vtime;
        unsigned long flags;
 
        local_irq_save(flags);
-       write_seqcount_begin(&t->vtime_seqcount);
-       t->vtime_snap_whence = VTIME_SYS;
-       t->vtime_snap = jiffies;
-       write_seqcount_end(&t->vtime_seqcount);
+       write_seqcount_begin(&vtime->seqcount);
+       vtime->state = VTIME_SYS;
+       vtime->starttime = sched_clock_cpu(cpu);
+       write_seqcount_end(&vtime->seqcount);
        local_irq_restore(flags);
 }
 
 u64 task_gtime(struct task_struct *t)
 {
+       struct vtime *vtime = &t->vtime;
        unsigned int seq;
        u64 gtime;
 
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
                return t->gtime;
 
        do {
-               seq = read_seqcount_begin(&t->vtime_seqcount);
+               seq = read_seqcount_begin(&vtime->seqcount);
 
                gtime = t->gtime;
-               if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
-                       gtime += vtime_delta(t);
+               if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+                       gtime += vtime->gtime + vtime_delta(vtime);
 
-       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+       } while (read_seqcount_retry(&vtime->seqcount, seq));
 
        return gtime;
 }
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
  */
 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-       u64 delta;
+       struct vtime *vtime = &t->vtime;
        unsigned int seq;
+       u64 delta;
 
        if (!vtime_accounting_enabled()) {
                *utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
        }
 
        do {
-               seq = read_seqcount_begin(&t->vtime_seqcount);
+               seq = read_seqcount_begin(&vtime->seqcount);
 
                *utime = t->utime;
                *stime = t->stime;
 
                /* Task is sleeping, nothing to add */
-               if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+               if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
                        continue;
 
-               delta = vtime_delta(t);
+               delta = vtime_delta(vtime);
 
                /*
                 * Task runs either in user or kernel space, add pending nohz time to
                 * the right place.
                 */
-               if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
-                       *utime += delta;
-               else if (t->vtime_snap_whence == VTIME_SYS)
-                       *stime += delta;
-       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+               if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+                       *utime += vtime->utime + delta;
+               else if (vtime->state == VTIME_SYS)
+                       *stime += vtime->stime + delta;
+       } while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
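
With the switch from jiffies to sched_clock_cpu(), the code above accumulates
nanosecond deltas per context and only feeds them to the accounting core once
at least a full tick's worth has built up. A standalone sketch of that
accumulate-and-flush scheme (hypothetical names; the TICK_NSEC value assumes
HZ=1000):

/* Hypothetical sketch of the accumulate-and-flush scheme; not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* assuming HZ=1000: one tick = 1 ms */

struct acct {
	uint64_t starttime;	/* last snapshot of the nsec clock */
	uint64_t stime;		/* accumulated but not yet flushed */
	uint64_t flushed;	/* total handed to the accounting core */
};

static void account_delta(struct acct *a, uint64_t now)
{
	/* Mirror vtime_delta(): ignore a clock that appears to go backwards. */
	if (now > a->starttime) {
		a->stime += now - a->starttime;
		a->starttime = now;
	}
	/* Flush only once a full tick has accumulated, so sub-tick deltas
	 * are carried over instead of being lost to rounding. */
	if (a->stime >= TICK_NSEC) {
		a->flushed += a->stime;
		a->stime = 0;
	}
}

int main(void)
{
	struct acct a = { 0 };

	account_delta(&a,  400000);	/* 0.4 ms: accumulated, not flushed */
	account_delta(&a, 1200000);	/* crosses one tick: flushed whole  */
	printf("flushed=%llu pending=%llu\n",
	       (unsigned long long)a.flushed,
	       (unsigned long long)a.stime);
	return 0;
}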
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                 * our sched_group. We may want to revisit it if we couldn't
                 * meet load balance goals by pulling other tasks on src_cpu.
                 *
-                * Also avoid computing new_dst_cpu if we have already computed
-                * one in current iteration.
+                * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+                * already computed one in current iteration.
                 */
-               if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+               if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
                        return 0;
 
                /* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .tasks          = LIST_HEAD_INIT(env.tasks),
        };
 
-       /*
-        * For NEWLY_IDLE load_balancing, we don't need to consider
-        * other cpus in our group
-        */
-       if (idle == CPU_NEWLY_IDLE)
-               env.dst_grpmask = NULL;
-
-       cpumask_copy(cpus, cpu_active_mask);
+       cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
        schedstat_inc(sd->lb_count[idle]);
 
@@ -8151,7 +8144,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                       if (!cpumask_empty(cpus)) {
+                       /*
+                        * Attempting to continue load balancing at the current
+                        * sched_domain level only makes sense if there are
+                        * active CPUs remaining as possible busiest CPUs to
+                        * pull load from which are not contained within the
+                        * destination group that is receiving any migrated
+                        * load.
+                        */
+                       if (!cpumask_subset(cpus, env.dst_grpmask)) {
                                env.loop = 0;
                                env.loop_break = sched_nr_migrate_break;
                                goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
                        .src_cpu        = busiest_rq->cpu,
                        .src_rq         = busiest_rq,
                        .idle           = CPU_IDLE,
+                       /*
+                        * can_migrate_task() doesn't need to compute new_dst_cpu
+                        * for active balancing. Since we have CPU_IDLE, but no
+                        * @dst_grpmask we need to make that test go away with lying
+                        * about DST_PINNED.
+                        */
+                       .flags          = LBF_DST_PINNED,
                };
 
                schedstat_inc(sd->alb_count);

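The new redo condition can be shown in isolation. A hypothetical bitmask
stand-in for struct cpumask (not kernel code): retrying at this sched_domain
level only helps if some candidate busiest CPU lies outside the destination
group that receives the migrated load.

/* Hypothetical bitmask sketch of the LBF_ALL_PINNED redo test above. */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long cpumask_t;	/* one bit per CPU */

static bool cpumask_subset(cpumask_t src, cpumask_t dst)
{
	return (src & ~dst) == 0;	/* every bit of src is also set in dst */
}

int main(void)
{
	cpumask_t cpus = 0x3;		/* remaining candidate busiest CPUs */
	cpumask_t dst_grpmask = 0xf;	/* group receiving migrated load */

	/*
	 * If every remaining CPU sits inside the destination group there is
	 * nothing left to pull from, so another redo pass is pointless.
	 */
	if (!cpumask_subset(cpus, dst_grpmask))
		printf("redo load balancing\n");
	else
		printf("give up at this sched_domain level\n");
	return 0;
}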