[PATCH] sched/cputime: make scale_stime() more precise

Oleg Nesterov Thu, 18 Jul 2019 06:16:15 -0700

People report that utime and stime from /proc/<pid>/stat become very wrong
when the numbers are big enough. In particular, the monitored application
can run all the time in user-space but only stime grows.


This is because scale_stime() is very inaccurate. It tries to minimize the
relative error, but the absolute error can be huge.

Andrew wrote the test-case:

        int main(int argc, char **argv)
        {
            struct task_cputime c;
            struct prev_cputime p;
            u64 st, pst, cst;
            u64 ut, put, cut;
            u64 x;
            int i = -1; // one step not printed

            if (argc != 2)
            {
                printf("usage: %s <start_in_seconds>\n", argv[0]);
                return 1;
            }
            x = strtoull(argv[1], NULL, 0) * SEC;
            printf("start=%lld\n", x);

            p.stime = 0;
            p.utime = 0;

            while (i++ < NSTEPS)
            {
                x += STEP;
                c.stime = x;
                c.utime = x;
                c.sum_exec_runtime = x + x;
                pst = cputime_to_clock_t(p.stime);
                put = cputime_to_clock_t(p.utime);
                cputime_adjust(&c, &p, &ut, &st);
                cst = cputime_to_clock_t(st);
                cut = cputime_to_clock_t(ut);
                if (i)
                    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld 
(%4lld)\n",
                        cut, cut - put, cst, cst - pst);
            }
        }

For example,

        $ ./stime 300000
        start=300000000000000
        ut(diff)/st(diff):            299994875 (   0)             300009124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300011124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300013124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300015124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300017124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300019124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300021124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300023124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300025124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300027124 
(2000)
        ut(diff)/st(diff):            299994875 (   0)             300029124 
(2000)
        ut(diff)/st(diff):            299996875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            299998875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300000875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300002875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300004875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300006875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300008875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300010875 (2000)             300029124 (  
 0)
        ut(diff)/st(diff):            300012055 (1180)             300029944 ( 
820)
        ut(diff)/st(diff):            300012055 (   0)             300031944 
(2000)
        ut(diff)/st(diff):            300012055 (   0)             300033944 
(2000)
        ut(diff)/st(diff):            300012055 (   0)             300035944 
(2000)
        ut(diff)/st(diff):            300012055 (   0)             300037944 
(2000)

shows the problem even when sum_exec_runtime is not that big: 300000 secs.

The new implementation of scale_stime() does the additional div64_u64_rem()
in a loop but see the comment, as long it is used by cputime_adjust() this
can happen only once.

Reported-by: Andrew Fox <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
---
 kernel/sched/cputime.c | 66 ++++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce8..ad055a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -525,47 +525,55 @@ void account_idle_ticks(unsigned long ticks)
 }
 
 /*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow
+ * by losing precision when the numbers are big.
+ *
+ * NOTE! currently the only user is cputime_adjust() and thus
+ *
+ *     stime < total && rtime > total
+ *
+ * this means that the end result is always precise and the additional
+ * div64_u64_rem() inside the main loop is called at most once.
  */
 static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 {
-       u64 scaled;
+       u64 res = 0, div, rem;
 
-       for (;;) {
-               /* Make sure "rtime" is the bigger of stime/rtime */
+       /* can stime * rtime overflow ? */
+       while (ilog2(stime) + ilog2(rtime) > 62) {
                if (stime > rtime)
                        swap(rtime, stime);
 
-               /* Make sure 'total' fits in 32 bits */
-               if (total >> 32)
-                       goto drop_precision;
-
-               /* Does rtime (and thus stime) fit in 32 bits? */
-               if (!(rtime >> 32))
-                       break;
-
-               /* Can we just balance rtime/stime rather than dropping bits? */
-               if (stime >> 31)
-                       goto drop_precision;
-
-               /* We can grow stime and shrink rtime and try to make them both 
fit */
-               stime <<= 1;
-               rtime >>= 1;
-               continue;
+               if (rtime >= total) {
+                       /*
+                        * (rtime * stime) / total is equal to
+                        *
+                        *      (rtime / total) * stime +
+                        *      (rtime % total) * stime / total
+                        *
+                        * if nothing overflows. Can the 1st multiplication
+                        * overflow? Yes, but we do not care: this can only
+                        * happen if the end result can't fit in u64 anyway.
+                        *
+                        * So the code below does
+                        *
+                        *      res += (rtime / total) * stime;
+                        *      rtime = rtime % total;
+                        */
+                       div = div64_u64_rem(rtime, total, &rem);
+                       res += div * stime;
+                       rtime = rem;
+                       continue;
+               }
 
-drop_precision:
-               /* We drop from rtime, it has more bits than stime */
+               /* drop precision */
                rtime >>= 1;
                total >>= 1;
+               if (!total)
+                       return res;
        }
 
-       /*
-        * Make sure gcc understands that this is a 32x32->64 multiply,
-        * followed by a 64/32->64 divide.
-        */
-       scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-       return scaled;
+       return res + div64_u64(stime * rtime, total);
 }
 
 /*
-- 
2.5.0

[PATCH] sched/cputime: make scale_stime() more precise

Reply via email to