* Julien Desfossez ([email protected]) wrote:
> This new option to clock_gettime allows the user to retrieve a precise
> timestamp using the LTTng infrastructure. If the TSC is not synchronized
> across the CPUs (validated at boot time), the function returns an error
> to the user, to make it clear that he won't be able to automatically
> synchronize kernel and userspace traces and needs to fall back on
> another clocksource.

I'm very happy to see the ball rolling! Comments below,

> 
> The main difference with using the TSC clocksource directly is that the
> time starts at machine boot and not at Linux boot, which makes it possible
> to correlate user and kernelspace events.
> On 64-bit architectures we only use the nsec field of the timespec to
> encode the time in nanoseconds; the sec field is always 0.
> On 32-bit we have to do the division and split the sec and nsec fields.
> 
> I am not particularly proud of the way the syscall is implemented (in
> kernel/posix-timers.c); I would be really glad if anyone has an idea how
> to clean that up.
> 
> Some benchmarks (with 20000000 iterations, reading the TSC before and after
> each call, on an i7 920):
> 
> 64-bit with vDSO
> average cycles for clock_realtime: 101
> average cycles for clock_monotonic: 104
> average cycles for clock_trace: 66
> 
> 64-bit without vDSO (using the syscall)
> average cycles for clock_realtime: 240
> average cycles for clock_monotonic: 256
> average cycles for clock_trace: 231
> 
> 32-bit, with AND without vDSO
> average cycles for clock_realtime: 649
> average cycles for clock_monotonic: 661
> average cycles for clock_trace: 630
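
For reference, I assume the measurement loop is more or less the following
(untested sketch; build with -lrt, the exact harness is obviously yours):

#include <stdio.h>
#include <time.h>

static inline unsigned long long rdtsc(void)
{
	unsigned int lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((unsigned long long)hi << 32) | lo;
}

int main(void)
{
	unsigned long long begin, end, total = 0;
	struct timespec ts;
	int i;

	/* Read the TSC before and after each call, average over all runs. */
	for (i = 0; i < 20000000; i++) {
		begin = rdtsc();
		clock_gettime(CLOCK_MONOTONIC, &ts);
		end = rdtsc();
		total += end - begin;
	}
	printf("average cycles for clock_monotonic: %llu\n",
	       total / 20000000);
	return 0;
}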

You always fall back on the syscall on 32-bit?

> 
> Signed-off-by: Julien Desfossez <[email protected]>
> ---
>  arch/x86/include/asm/trace-clock.h |    1 +
>  arch/x86/include/asm/vgtod.h       |    2 +
>  arch/x86/kernel/vsyscall_64.c      |    5 ++++
>  arch/x86/vdso/vclock_gettime.c     |   44 ++++++++++++++++++++++++++++++++++++
>  include/linux/time.h               |    1 +
>  kernel/posix-timers.c              |   27 ++++++++++++++++++++++
>  6 files changed, 80 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
> index 01bc2f5..c1fd160 100644
> --- a/arch/x86/include/asm/trace-clock.h
> +++ b/arch/x86/include/asm/trace-clock.h
> @@ -14,6 +14,7 @@
>  #include <asm/system.h>
>  #include <asm/processor.h>
>  #include <asm/atomic.h>
> +#include <asm/vgtod.h>
>  
>  /* Minimum duration of a probe, in cycles */
>  #define TRACE_CLOCK_MIN_PROBE_DURATION 200
> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
> index 3d61e20..7367f1c 100644
> --- a/arch/x86/include/asm/vgtod.h
> +++ b/arch/x86/include/asm/vgtod.h
> @@ -12,6 +12,8 @@ struct vsyscall_gtod_data {
>       u32             wall_time_nsec;
>  
>       int             sysctl_enabled;
> +     int             trace_clock_is_sync;
> +     unsigned long           cpu_khz;
>       struct timezone sys_tz;
>       struct { /* extract of a clocksource struct */
>               cycle_t (*vread)(void);
> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
> index dcbb28c..ce750b5 100644
> --- a/arch/x86/kernel/vsyscall_64.c
> +++ b/arch/x86/kernel/vsyscall_64.c
> @@ -44,6 +44,8 @@
>  #include <asm/desc.h>
>  #include <asm/topology.h>
>  #include <asm/vgtod.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>  
>  #define __vsyscall(nr) \
>               __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
> @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
>  {
>       .lock = SEQLOCK_UNLOCKED,
>       .sysctl_enabled = 1,
> +     .trace_clock_is_sync = 0,
>  };
>  
>  void update_vsyscall_tz(void)
> @@ -89,6 +92,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
>       vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
>       vsyscall_gtod_data.wall_to_monotonic = *wtm;
>       vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
> +     vsyscall_gtod_data.cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +     vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
>       write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
>  }
>  
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index ee55754..81bc9c7 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -22,6 +22,8 @@
>  #include <asm/hpet.h>
>  #include <asm/unistd.h>
>  #include <asm/io.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>  #include "vextern.h"
>  
>  #define gtod vdso_vsyscall_gtod_data
> @@ -69,6 +71,7 @@ vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
>               --sec;
>       }
>       ts->tv_sec = sec;
> +

extra whitespace to remove.

>       ts->tv_nsec = nsec;
>  }
>  
> @@ -111,6 +114,41 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
>       return 0;
>  }
>  
> +#ifdef CONFIG_X86
> +notrace static noinline int do_trace_clock(struct timespec *ts)
> +{
> +     cycle_t cycles;
> +     unsigned long tmp;
> +
> +     if (!gtod->trace_clock_is_sync)
> +             return -EPERM;
> +
> +     /* Copy of the version in kernel/tsc.c which we cannot directly access
> +      *
> +      * Surround the RDTSC by barriers, to make sure it's not
> +      * speculated to outside the seqlock critical section and
> +      * does not cause time warps:
> +      */
> +     rdtsc_barrier();
> +     cycles = (cycle_t)vget_cycles();
> +     rdtsc_barrier();
> +
> +     /* On 64-bit, the nsec field is enough to store the whole value;
> +      * on 32-bit we need to split it into two fields.
> +      */
> +     tmp = cycles * (USEC_PER_SEC / gtod->cpu_khz) >> CYC2NS_SCALE_FACTOR;

I'd go for creation of:

union aliased_ts {
  struct timespec ts;
  u64 count;
};

and then:

union aliased_ts *ats = (union aliased_ts *)ts;

ats->count = tmp;

both for x86 32 and 64.

Then you cast the returned struct timespec (in user-space) into the same aliased
structure.

Does it seem clean enough?

The goal is to trash the code below, especially the division.
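
On the user-space side, I picture something like this (sketch only;
read_trace_clock is just an illustrative name, and this assumes the
u64/timespec overlay lines up the same way in user-space as in the kernel,
which on 32-bit also depends on endianness):

#include <stdint.h>
#include <time.h>

#ifndef CLOCK_TRACE
#define CLOCK_TRACE 32	/* from the patch, not in libc headers yet */
#endif

union aliased_ts {
	struct timespec ts;
	uint64_t count;
};

static uint64_t read_trace_clock(void)
{
	union aliased_ts ats;

	if (clock_gettime(CLOCK_TRACE, &ats.ts) != 0)
		return 0;	/* EPERM/EINVAL: caller falls back */
	return ats.count;	/* full 64-bit ns count, no division needed */
}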

> +#ifdef CONFIG_X86_64
> +     ts->tv_nsec = tmp;
> +     ts->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> +     ts->tv_nsec = do_div(tmp, NSEC_PER_SEC);
> +     ts->tv_sec = tmp;
> +#endif
> +
> +     return 0;
> +}
> +#endif /* CONFIG_X86 */
> +
>  notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>  {
>       if (likely(gtod->sysctl_enabled))
> @@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>                       return do_realtime_coarse(ts);
>               case CLOCK_MONOTONIC_COARSE:
>                       return do_monotonic_coarse(ts);

OK, so on kernels not supporting our clock ID, we can detect this because
"-EINVAL" is returned. Yep, looks good.

> +#ifdef CONFIG_X86
> +             case CLOCK_TRACE:
> +                     return do_trace_clock(ts);
> +#endif
> +             default:
> +                     return -EINVAL;
>               }
>       return vdso_fallback_gettime(clock, ts);
>  }
> diff --git a/include/linux/time.h b/include/linux/time.h
> index 9f15ac7..9e6d9e2 100644
> --- a/include/linux/time.h
> +++ b/include/linux/time.h
> @@ -290,6 +290,7 @@ struct itimerval {
>  #define CLOCK_MONOTONIC_RAW          4
>  #define CLOCK_REALTIME_COARSE                5
>  #define CLOCK_MONOTONIC_COARSE               6
> +#define CLOCK_TRACE                                  32
>  
>  /*
>   * The IDs of various hardware clocks:
> diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
> index 9ca4973..4517f1d 100644
> --- a/kernel/posix-timers.c
> +++ b/kernel/posix-timers.c
> @@ -37,6 +37,8 @@
>  #include <linux/mutex.h>
>  
>  #include <asm/uaccess.h>
> +#include <linux/trace-clock.h>
> +#include <linux/timer.h>
>  #include <linux/list.h>
>  #include <linux/init.h>
>  #include <linux/compiler.h>
> @@ -957,6 +959,31 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
>  {
>       struct timespec kernel_tp;
>       int error;
> +#ifdef CONFIG_X86

ifdefs in generic code are usually frowned upon.

You probably want to modify "invalid_clockid" so it calls a per-architecture
check, which will only succeed for the x86 arch.

This way, you don't even have to bypass the currently existing test.
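
Something along these lines, perhaps (sketch; the hook name is just a
suggestion):

/* arch/x86/include/asm/trace-clock.h (or similar): */
#define arch_extra_clockid_valid(which_clock)	\
	((which_clock) == CLOCK_TRACE)

/* kernel/posix-timers.c: */
#ifndef arch_extra_clockid_valid
#define arch_extra_clockid_valid(which_clock)	(0)
#endif

static int invalid_clockid(const clockid_t which_clock)
{
	if (arch_extra_clockid_valid(which_clock))
		return 0;
	/* ... existing checks unchanged ... */
}

That keeps the generic code free of CONFIG_X86 ifdefs.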

Thanks,

Mathieu

> +     u64 cycles, tmp, scaled_cpu_khz;
> +
> +     if (which_clock == CLOCK_TRACE) {
> +             if (!_trace_clock_is_sync)
> +                     return -EPERM;
> +
> +             scaled_cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +
> +             rdtsc_barrier();
> +             rdtscll(cycles);
> +             rdtsc_barrier();
> +
> +#ifdef CONFIG_X86_64
> +             tmp = cycles*(USEC_PER_SEC/scaled_cpu_khz) >> CYC2NS_SCALE_FACTOR;
> +             tp->tv_nsec = tmp;
> +             tp->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> +             tmp = cycles*(div64_u64(USEC_PER_SEC, scaled_cpu_khz)) >> CYC2NS_SCALE_FACTOR;
> +             tp->tv_sec = div64_u64(tmp, NSEC_PER_SEC);
> +             tp->tv_nsec = tmp - NSEC_PER_SEC * tp->tv_sec;
> +#endif
> +             return 0;
> +     }
> +#endif
>  
>       if (invalid_clockid(which_clock))
>               return -EINVAL;
> -- 
> 1.7.0.4
> 

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com
