On Tue, Apr 14, 2015 at 04:09:45PM -0700, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
> 
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain the
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), more than 30% of total time was spent
> trying to obtain this kernel lock to update these group timer stats.
> 
> This patch converts the timers to 64 bit atomic variables and uses
> atomic adds to update them without a lock. With this patch, the
> percentage of total time spent updating thread group cputimer timers
> was reduced from 30% down to less than 1%.
> 
> Note: On 32 bit systems using the generic 64 bit atomics, this causes
> sample_group_cputimer() to take locks 3 times instead of just once.
> However, we tested this patch on a 32 bit ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> The likely explanation is that 32 bit systems usually have a small
> number of CPUs, so cacheline contention from the extra spinlock
> acquisitions, which only happen periodically, is not really apparent
> on smaller systems.
> 
> Signed-off-by: Jason Low <jason.l...@hp.com>
> ---
>  include/linux/init_task.h      |    7 +++--
>  include/linux/sched.h          |   10 ++-----
>  kernel/fork.c                  |    3 --
>  kernel/sched/stats.h           |   12 ++-------
>  kernel/time/posix-cpu-timers.c |   48 ++++++++++++++++++++--------------------
>  5 files changed, 34 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 696d223..7b9d8b5 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
>       .cpu_timers     = INIT_CPU_TIMERS(sig.cpu_timers),              \
>       .rlim           = INIT_RLIMITS,                                 \
>       .cputimer       = {                                             \
> -             .cputime = INIT_CPUTIME,                                \
> -             .running = 0,                                           \
> -             .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),    \
> +             .utime            = ATOMIC64_INIT(0),                   \
> +             .stime            = ATOMIC64_INIT(0),                   \
> +             .sum_exec_runtime = ATOMIC64_INIT(0),                   \
> +             .running          = 0                                   \
>       },                                                              \
>       .cred_guard_mutex =                                             \
>                __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 379fb3b..a5bb23b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -592,9 +592,10 @@ struct task_cputime {
>   * used for thread group CPU timer calculations.
>   */
>  struct thread_group_cputimer {
> -     struct task_cputime cputime;
> +     atomic64_t utime;
> +     atomic64_t stime;
> +     atomic64_t sum_exec_runtime;
>       int running;
> -     raw_spinlock_t lock;
>  };
>  
>  #include <linux/rwsem.h>
> @@ -2952,11 +2953,6 @@ static __always_inline bool need_resched(void)
>  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>  
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> -     raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
>  /*
>   * Reevaluate whether the task has signals pending delivery.
>   * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d96a0ca..da8e6dd 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1042,9 +1042,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
>  {
>       unsigned long cpu_limit;
>  
> -     /* Thread group counters. */
> -     thread_group_cputime_init(sig);
> -
>       cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
>       if (cpu_limit != RLIM_INFINITY) {
>               sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..adda94e 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
>       if (!cputimer_running(tsk))
>               return;
>  
> -     raw_spin_lock(&cputimer->lock);
> -     cputimer->cputime.utime += cputime;
> -     raw_spin_unlock(&cputimer->lock);
> +     atomic64_add(cputime, &cputimer->utime);
>  }
>  
>  /**
> @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
>       if (!cputimer_running(tsk))
>               return;
>  
> -     raw_spin_lock(&cputimer->lock);
> -     cputimer->cputime.stime += cputime;
> -     raw_spin_unlock(&cputimer->lock);
> +     atomic64_add(cputime, &cputimer->stime);
>  }
>  
>  /**
> @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
>       if (!cputimer_running(tsk))
>               return;
>  
> -     raw_spin_lock(&cputimer->lock);
> -     cputimer->cputime.sum_exec_runtime += ns;
> -     raw_spin_unlock(&cputimer->lock);
> +     atomic64_add(ns, &cputimer->sum_exec_runtime);
>  }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index e072d98..7e96082 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -196,39 +196,44 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
>       return 0;
>  }
>  
> -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
>  {
> -     if (b->utime > a->utime)
> -             a->utime = b->utime;
> +     if (sum->utime > atomic64_read(&cputimer->utime))
> +             atomic64_set(&cputimer->utime, sum->utime);
>  
> -     if (b->stime > a->stime)
> -             a->stime = b->stime;
> +     if (sum->stime > atomic64_read(&cputimer->stime))
> +             atomic64_set(&cputimer->stime, sum->stime);
>  
> -     if (b->sum_exec_runtime > a->sum_exec_runtime)
> -             a->sum_exec_runtime = b->sum_exec_runtime;
> +     if (sum->sum_exec_runtime > atomic64_read(&cputimer->sum_exec_runtime))
> +             atomic64_set(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> +}
> +
> +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> +                                       struct thread_group_cputimer *cputimer)
> +{
> +     times->utime = atomic64_read(&cputimer->utime);
> +     times->stime = atomic64_read(&cputimer->stime);
> +     times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
>  }
>  
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
>  {
>       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
>       struct task_cputime sum;
> -     unsigned long flags;
>  
>       if (!cputimer->running) {
>               /*
>                * The POSIX timer interface allows for absolute time expiry
>                * values through the TIMER_ABSTIME flag, therefore we have
> -              * to synchronize the timer to the clock every time we start
> -              * it.
> +              * to synchronize the timer to the clock every time we start it.
>                */
>               thread_group_cputime(tsk, &sum);
> -             raw_spin_lock_irqsave(&cputimer->lock, flags);
> -             cputimer->running = 1;
> -             update_gt_cputime(&cputimer->cputime, &sum);
> -     } else
> -             raw_spin_lock_irqsave(&cputimer->lock, flags);
> -     *times = cputimer->cputime;
> -     raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +             update_gt_cputime(cputimer, &sum);
> +             /* Start 'running' after update_gt_cputime() */
> +             smp_store_release(&cputimer->running, 1);

This barrier should be mirrored somewhere, but I can't see where in this patch.
Maybe it's in another patch in the series, or maybe there is already a barrier
in the existing code that I'm missing. I would expect to see it in
account_group_*_time(). In any case, there should be a comment about what it
mirrors.
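
Something like the below is what I have in mind. This is just a sketch, not
tested; the simplified running check is my own shorthand, since the real
helpers go through cputimer_running(tsk), so the acquire would more likely
belong there:

static inline void account_group_user_time(struct task_struct *tsk,
					   cputime_t cputime)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	/*
	 * Pairs with smp_store_release(&cputimer->running, 1) in
	 * thread_group_cputimer(), so we only touch the atomic timers
	 * after update_gt_cputime() has brought them up to date.
	 */
	if (!smp_load_acquire(&cputimer->running))
		return;

	atomic64_add(cputime, &cputimer->utime);
}

The load-acquire (or an explicit smp_rmb() with a comment) would then be the
documented counterpart of the store-release above.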

> +     }
> +     sample_group_cputimer(times, cputimer);
>  }
>  
>  /*
> @@ -885,11 +890,8 @@ static void check_thread_timers(struct task_struct *tsk,
>  static void stop_process_timers(struct signal_struct *sig)
>  {
>       struct thread_group_cputimer *cputimer = &sig->cputimer;
> -     unsigned long flags;
>  
> -     raw_spin_lock_irqsave(&cputimer->lock, flags);
> -     cputimer->running = 0;
> -     raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +     WRITE_ONCE(cputimer->running, 0);
>  }
>  
>  static u32 onecputick;
> @@ -1114,9 +1116,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
>       if (sig->cputimer.running) {
>               struct task_cputime group_sample;
>  
> -             raw_spin_lock(&sig->cputimer.lock);
> -             group_sample = sig->cputimer.cputime;
> -             raw_spin_unlock(&sig->cputimer.lock);
> +             sample_group_cputimer(&group_sample, &sig->cputimer);
>  
>               if (task_cputime_expired(&group_sample, &sig->cputime_expires))
>                       return 1;
> -- 
> 1.7.2.5
> 