[RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time

Christophe Leroy Mon, 13 Jan 2020 09:22:35 -0800

Using __iter_div_ulong_rem() is suboptimal on 32 bits.
Nanoseconds are only 32 bits, and VDSO data is updated every 10ms
so nsec will never overflow 32 bits.


Add an equivalent of __iter_div_u64_rem() but based
on unsigned long to better fit with 32 bits arches.

Before:
gettimeofday:    vdso: 1078 nsec/call
clock-gettime-monotonic-raw:    vdso: 1317 nsec/call
clock-gettime-monotonic:    vdso: 1255 nsec/call

After:
gettimeofday:    vdso: 1032 nsec/call
clock-gettime-monotonic-raw:    vdso: 1312 nsec/call
clock-gettime-monotonic:    vdso: 1243 nsec/call
Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
---
 lib/vdso/gettimeofday.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index decd3f2b37af..da15a8842825 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 
mult)
 }
 #endif
 
+static __always_inline u32
+__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long 
*remainder)
+{
+       u32 ret = 0;
+
+       while (dividend >= divisor) {
+               /* The following asm() prevents the compiler from
+                  optimising this loop into a modulo operation.  */
+               asm("" : "+rm"(dividend));
+
+               dividend -= divisor;
+               ret++;
+       }
+
+       *remainder = dividend;
+
+       return ret;
+}
+
 static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
                                   struct __kernel_timespec *ts)
 {
        const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
        u64 cycles, last, sec, ns;
        u32 seq;
+       unsigned long nsec;
 
        do {
                seq = vdso_read_begin(vd);
@@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data 
*vd, clockid_t clk,
                        return -1;
 
                ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
-               ns >>= vd->shift;
+               nsec = ns >> vd->shift;
                sec = vdso_ts->sec;
        } while (unlikely(vdso_read_retry(vd, seq)));
 
@@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data 
*vd, clockid_t clk,
         * Do this outside the loop: a race inside the loop could result
         * in __iter_div_u64_rem() being extremely slow.
         */
-       ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-       ts->tv_nsec = ns;
+       ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec);
+       ts->tv_nsec = nsec;
 
        return 0;
 }
-- 
2.13.3

[RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time

Reply via email to