Scaling cputime causes problems; a bunch of them have been fixed, but it is
still possible to hit a multiplication overflow issue, which makes the
{u,s}time values incorrect. This problem has no good solution in the kernel.

This patch removes the scaling code and exports the raw {u,s}time values.
Procps programs can use the newly introduced sum_exec_runtime to find the
precisely calculated process CPU time and scale the utime/stime values
accordingly.

Unfortunately, the times(2) syscall has no such option.

This change affects kernels compiled without CONFIG_VIRT_CPU_ACCOUNTING_*.

Signed-off-by: Stanislaw Gruszka <sgrus...@redhat.com>
---
 fs/proc/array.c        |    4 +-
 include/linux/sched.h  |   20 --------
 kernel/exit.c          |    4 +-
 kernel/fork.c          |    3 -
 kernel/sched/cputime.c |  117 +-----------------------------------------------
 kernel/sys.c           |    6 +-
 6 files changed, 8 insertions(+), 146 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 1444dc5..5feadc4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -459,7 +459,7 @@ static int do_task_stat(struct seq_file *m, struct 
pid_namespace *ns,
 
                        min_flt += sig->min_flt;
                        maj_flt += sig->maj_flt;
-                       thread_group_cputime_adjusted(task, &cputime);
+                       thread_group_cputime(task, &cputime);
                        utime = cputime.utime;
                        stime = cputime.stime;
                        sum_exec_runtime = cputime.sum_exec_runtime;
@@ -478,7 +478,7 @@ static int do_task_stat(struct seq_file *m, struct 
pid_namespace *ns,
        if (!whole) {
                min_flt = task->min_flt;
                maj_flt = task->maj_flt;
-               task_cputime_adjusted(task, &utime, &stime);
+               task_cputime(task, &utime, &stime);
                sum_exec_runtime = task->se.sum_exec_runtime;
                gtime = task_gtime(task);
        }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c25772d..23c8ac3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -397,18 +397,6 @@ struct cpu_itimer {
 };
 
 /**
- * struct cputime - snaphsot of system and user cputime
- * @utime: time spent in user mode
- * @stime: time spent in system mode
- *
- * Gathers a generic snapshot of user and system time.
- */
-struct cputime {
-       cputime_t utime;
-       cputime_t stime;
-};
-
-/**
  * struct task_cputime - collected CPU time counts
  * @utime:             time spent in user mode, in &cputime_t units
  * @stime:             time spent in kernel mode, in &cputime_t units
@@ -558,9 +546,6 @@ struct signal_struct {
        cputime_t utime, stime, cutime, cstime;
        cputime_t gtime;
        cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       struct cputime prev_cputime;
-#endif
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
@@ -1161,9 +1146,6 @@ struct task_struct {
 
        cputime_t utime, stime, utimescaled, stimescaled;
        cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       struct cputime prev_cputime;
-#endif
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqlock_t vtime_seqlock;
        unsigned long long vtime_snap;
@@ -1597,8 +1579,6 @@ static inline cputime_t task_gtime(struct task_struct *t)
        return t->gtime;
 }
 #endif
-extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, 
cputime_t *st);
-extern void thread_group_cputime_adjusted(struct task_struct *p, struct 
task_cputime *ct);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index fb158f1..b6bd7ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1084,11 +1084,11 @@ static int wait_task_zombie(struct wait_opts *wo, 
struct task_struct *p)
                 * as other threads in the parent group can be right
                 * here reaping other children at the same time.
                 *
-                * We use thread_group_cputime_adjusted() to get times for the 
thread
+                * We use thread_group_cputime() to get times for the thread
                 * group, which consolidates times for all threads in the
                 * group including the group leader.
                 */
-               thread_group_cputime_adjusted(p, &tg_cputime);
+               thread_group_cputime(p, &tg_cputime);
                spin_lock_irq(&p->real_parent->sighand->siglock);
                psig = p->real_parent->signal;
                sig = p->signal;
diff --git a/kernel/fork.c b/kernel/fork.c
index 339f60d..2ae1706 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,9 +1233,6 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
 
        p->utime = p->stime = p->gtime = 0;
        p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqlock_init(&p->vtime_seqlock);
        p->vtime_snap = 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a600f7f..23df74b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -448,19 +448,7 @@ EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       *ut = p->utime;
-       *st = p->stime;
-}
-
-void thread_group_cputime_adjusted(struct task_struct *p, struct task_cputime 
*cputime)
-{
-       thread_group_cputime(p, cputime);
-}
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -516,109 +504,6 @@ void account_idle_ticks(unsigned long ticks)
        account_idle_time(jiffies_to_cputime(ticks));
 }
 
-/*
- * Perform (stime * rtime) / total with reduced chances
- * of multiplication overflows by using smaller factors
- * like quotient and remainders of divisions between
- * rtime and total.
- */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
-{
-       u64 rem, res, scaled;
-
-       if (rtime >= total) {
-               /*
-                * Scale up to rtime / total then add
-                * the remainder scaled to stime / total.
-                */
-               res = div64_u64_rem(rtime, total, &rem);
-               scaled = stime * res;
-               scaled += div64_u64(stime * rem, total);
-       } else {
-               /*
-                * Same in reverse: scale down to total / rtime
-                * then substract that result scaled to
-                * to the remaining part.
-                */
-               res = div64_u64_rem(total, rtime, &rem);
-               scaled = div64_u64(stime, res);
-               scaled -= div64_u64(scaled * rem, total);
-       }
-
-       return (__force cputime_t) scaled;
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
- */
-static void cputime_adjust(struct task_cputime *curr,
-                          struct cputime *prev,
-                          cputime_t *ut, cputime_t *st)
-{
-       cputime_t rtime, stime, total;
-
-       if (vtime_accounting_enabled()) {
-               *ut = curr->utime;
-               *st = curr->stime;
-               return;
-       }
-
-       stime = curr->stime;
-       total = stime + curr->utime;
-
-       /*
-        * Tick based cputime accounting depend on random scheduling
-        * timeslices of a task to be interrupted or not by the timer.
-        * Depending on these circumstances, the number of these interrupts
-        * may be over or under-optimistic, matching the real user and system
-        * cputime with a variable precision.
-        *
-        * Fix this by scaling these tick based values against the total
-        * runtime accounted by the CFS scheduler.
-        */
-       rtime = nsecs_to_cputime(curr->sum_exec_runtime);
-
-       if (!rtime) {
-               stime = 0;
-       } else if (!total) {
-               stime = rtime;
-       } else {
-               stime = scale_stime((__force u64)stime,
-                                   (__force u64)rtime, (__force u64)total);
-       }
-
-       /*
-        * If the tick based count grows faster than the scheduler one,
-        * the result of the scaling may go backward.
-        * Let's enforce monotonicity.
-        */
-       prev->stime = max(prev->stime, stime);
-       prev->utime = max(prev->utime, rtime - prev->stime);
-
-       *ut = prev->utime;
-       *st = prev->stime;
-}
-
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       struct task_cputime cputime = {
-               .sum_exec_runtime = p->se.sum_exec_runtime,
-       };
-
-       task_cputime(p, &cputime.utime, &cputime.stime);
-       cputime_adjust(&cputime, &p->prev_cputime, ut, st);
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_cputime_adjusted(struct task_struct *p, struct task_cputime 
*cputime)
-{
-       thread_group_cputime(p, cputime);
-       cputime_adjust(cputime, &p->signal->prev_cputime,
-                      &cputime->utime, &cputime->stime);
-}
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/sys.c b/kernel/sys.c
index 2f555c1..00f143e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1049,7 +1049,7 @@ void do_sys_times(struct tms *tms)
        struct task_cputime tg_cputime;
 
        spin_lock_irq(&current->sighand->siglock);
-       thread_group_cputime_adjusted(current, &tg_cputime);
+       thread_group_cputime(current, &tg_cputime);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
        spin_unlock_irq(&current->sighand->siglock);
@@ -1708,7 +1708,7 @@ static void k_getrusage(struct task_struct *p, int who, 
struct rusage *r)
        utime = stime = 0;
 
        if (who == RUSAGE_THREAD) {
-               task_cputime_adjusted(current, &utime, &stime);
+               task_cputime(current, &utime, &stime);
                accumulate_thread_rusage(p, r);
                maxrss = p->signal->maxrss;
                goto out;
@@ -1734,7 +1734,7 @@ static void k_getrusage(struct task_struct *p, int who, 
struct rusage *r)
                                break;
 
                case RUSAGE_SELF:
-                       thread_group_cputime_adjusted(p, &tg_cputime);
+                       thread_group_cputime(p, &tg_cputime);
                        utime += tg_cputime.utime;
                        stime += tg_cputime.stime;
                        r->ru_nvcsw += p->signal->nvcsw;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to