To implement kcpustat correctly under nohz_full, we need to track the task running on a given CPU and read its vtime state safely, reliably, and locklessly.
This leaves us with tracking and fetching that task under RCU. This will be done in a further patch. Until then we need to prepare vtime for handling that properly and close the accounting before we meet the earliest opportunity for the RCU delayed put_task_struct() to be queued. That point happens to be in exit_notify() in case of auto-reaping. Therefore we need to finish the accounting right before exit_notify(). After that we shouldn't track the exiting task any further. Signed-off-by: Frederic Weisbecker <frede...@kernel.org> Cc: Yauheni Kaliuta <yauheni.kali...@redhat.com> Cc: Thomas Gleixner <t...@linutronix.de> Cc: Rik van Riel <r...@redhat.com> Cc: Peter Zijlstra <pet...@infradead.org> Cc: Wanpeng Li <wanpen...@tencent.com> Cc: Ingo Molnar <mi...@kernel.org> --- include/linux/sched.h | 2 ++ include/linux/vtime.h | 2 ++ kernel/exit.c | 1 + kernel/sched/cputime.c | 56 ++++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d458d65..27e0544 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -265,6 +265,8 @@ struct task_cputime { enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, + /* Task has passed exit_notify() */ + VTIME_DEAD, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ diff --git a/include/linux/vtime.h b/include/linux/vtime.h index d9160ab..8350a0b 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -73,12 +73,14 @@ extern void vtime_user_exit(struct task_struct *tsk); extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); +extern void vtime_exit_task(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void 
vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } static inline void vtime_guest_exit(struct task_struct *tsk) { } static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } +static inline void vtime_exit_task(struct task_struct *tsk) { } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/exit.c b/kernel/exit.c index 0e21e6d..cae3fe9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -883,6 +883,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + vtime_exit_task(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f64afd7..a0c3a82 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -702,7 +702,7 @@ static u64 get_vtime_delta(struct vtime *vtime) * errors from causing elapsed vtime to go negative. */ other = account_other_time(delta); - WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); + WARN_ON_ONCE(vtime->state < VTIME_IDLE); vtime->starttime += delta; return delta - other; @@ -813,17 +813,31 @@ void vtime_task_switch_generic(struct task_struct *prev) { struct vtime *vtime = &prev->vtime; - write_seqcount_begin(&vtime->seqcount); - if (vtime->state == VTIME_IDLE) - vtime_account_idle(prev); - else - __vtime_account_kernel(prev, vtime); - vtime->state = VTIME_INACTIVE; - vtime->cpu = -1; - write_seqcount_end(&vtime->seqcount); + /* + * Flush the prev task vtime, unless it has passed + * vtime_exit_task(), in which case there is nothing + * left to account. + */ + if (vtime->state != VTIME_DEAD) { + write_seqcount_begin(&vtime->seqcount); + if (vtime->state == VTIME_IDLE) + vtime_account_idle(prev); + else + __vtime_account_kernel(prev, vtime); + vtime->state = VTIME_INACTIVE; + vtime->cpu = -1; + write_seqcount_end(&vtime->seqcount); + } vtime = ¤t->vtime; + /* + * Ignore the next task if it has been preempted after + * vtime_exit_task(). 
+ */ + if (vtime->state == VTIME_DEAD) + return; + write_seqcount_begin(&vtime->seqcount); if (is_idle_task(current)) vtime->state = VTIME_IDLE; @@ -850,6 +864,30 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_restore(flags); } +/* + * This is the final settlement point after which we don't account + * anymore vtime for this task. + */ +void vtime_exit_task(struct task_struct *t) +{ + struct vtime *vtime = &t->vtime; + unsigned long flags; + + local_irq_save(flags); + write_seqcount_begin(&vtime->seqcount); + /* + * A task that has never run on a nohz_full CPU hasn't + * been tracked by vtime. Thus it's in VTIME_INACTIVE + * state. Nothing to account for it. + */ + if (vtime->state != VTIME_INACTIVE) + vtime_account_system(t, vtime); + vtime->state = VTIME_DEAD; + vtime->cpu = -1; + write_seqcount_end(&vtime->seqcount); + local_irq_restore(flags); +} + u64 task_gtime(struct task_struct *t) { struct vtime *vtime = &t->vtime; -- 2.7.4