* Julien Desfossez ([email protected]) wrote:
> This new option to clock_gettime allows the user to retrieve a precise
> timestamp using the LTTng infrastructure. If the TSC is not synchronized
> across the CPUs (validated at boot time), the function returns an error
> to the user, so that they understand that kernel and userspace traces
> cannot be synchronized automatically and that they need to fall back on
> another clocksource.
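For context, here is a minimal sketch of the calling convention described
above; it is not part of the patch. It assumes the CLOCK_TRACE value
defined later in this patch, and that the libc wrapper sets errno from the
kernel's -EPERM (TSC not synchronized) or -EINVAL (kernel without this
patch), so a tracer can fall back on another clocksource:

        #include <errno.h>
        #include <stdio.h>
        #include <time.h>

        #ifndef CLOCK_TRACE
        #define CLOCK_TRACE 32  /* value proposed in include/linux/time.h below */
        #endif

        int main(void)
        {
                struct timespec ts;

                if (clock_gettime(CLOCK_TRACE, &ts) == 0) {
                        /* TSC synchronized: usable with kernel traces */
                        printf("trace clock: %ld.%09ld\n",
                               (long)ts.tv_sec, ts.tv_nsec);
                } else if (errno == EPERM || errno == EINVAL) {
                        /* EPERM: TSC not synchronized across CPUs.
                         * EINVAL: kernel without this patch.
                         * Either way, fall back on another clocksource. */
                        clock_gettime(CLOCK_MONOTONIC, &ts);
                        printf("fallback: %ld.%09ld\n",
                               (long)ts.tv_sec, ts.tv_nsec);
                }
                return 0;
        }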
I'm very happy to see the ball rolling! Comments below,

> 
> The main difference with using the TSC clocksource directly is that the
> time starts at machine boot and not at Linux boot, which makes it
> possible to correlate user and kernelspace events.
> On 64-bit architectures we only use the nsec field of the timespec to
> encode the time in nanoseconds; the sec field is always 0.
> On 32-bit we have to do the division and split the sec and nsec.
> 
> I am not particularly proud of the way the syscall is implemented (in
> kernel/posix-timers.c); I would be really glad if anyone has an idea to
> clean that up.
> 
> Some benchmarks (with 20000000 iterations reading the TSC before and
> after each call on an i7 920):
> 
> 64-bit with vDSO
> average cycles for clock_realtime: 101
> average cycles for clock_monotonic: 104
> average cycles for clock_trace: 66
> 
> 64-bit without vDSO (using the syscall)
> average cycles for clock_realtime: 240
> average cycles for clock_monotonic: 256
> average cycles for clock_trace: 231
> 
> 32-bit with AND without vDSO
> average cycles for clock_realtime: 649
> average cycles for clock_monotonic: 661
> average cycles for clock_trace: 630

Do you always fall back on the syscall on 32-bit? (A sketch of the
measurement loop described here follows the vsyscall_64.c hunk below.)

> 
> Signed-off-by: Julien Desfossez <[email protected]>
> ---
>  arch/x86/include/asm/trace-clock.h |    1 +
>  arch/x86/include/asm/vgtod.h       |    2 +
>  arch/x86/kernel/vsyscall_64.c      |    5 ++++
>  arch/x86/vdso/vclock_gettime.c     |   44 ++++++++++++++++++++++++++++++++++++
>  include/linux/time.h               |    1 +
>  kernel/posix-timers.c              |   27 ++++++++++++++++++++++
>  6 files changed, 80 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
> index 01bc2f5..c1fd160 100644
> --- a/arch/x86/include/asm/trace-clock.h
> +++ b/arch/x86/include/asm/trace-clock.h
> @@ -14,6 +14,7 @@
>  #include <asm/system.h>
>  #include <asm/processor.h>
>  #include <asm/atomic.h>
> +#include <asm/vgtod.h>
>  
>  /* Minimum duration of a probe, in cycles */
>  #define TRACE_CLOCK_MIN_PROBE_DURATION 200
> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
> index 3d61e20..7367f1c 100644
> --- a/arch/x86/include/asm/vgtod.h
> +++ b/arch/x86/include/asm/vgtod.h
> @@ -12,6 +12,8 @@ struct vsyscall_gtod_data {
>  	u32 wall_time_nsec;
>  
>  	int sysctl_enabled;
> +	int trace_clock_is_sync;
> +	unsigned long cpu_khz;
>  	struct timezone sys_tz;
>  	struct { /* extract of a clocksource struct */
>  		cycle_t (*vread)(void);
> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
> index dcbb28c..ce750b5 100644
> --- a/arch/x86/kernel/vsyscall_64.c
> +++ b/arch/x86/kernel/vsyscall_64.c
> @@ -44,6 +44,8 @@
>  #include <asm/desc.h>
>  #include <asm/topology.h>
>  #include <asm/vgtod.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>  
>  #define __vsyscall(nr) \
>  	__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
> @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
>  {
>  	.lock = SEQLOCK_UNLOCKED,
>  	.sysctl_enabled = 1,
> +	.trace_clock_is_sync = 0,
>  };
>  
>  void update_vsyscall_tz(void)
> @@ -89,6 +92,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
>  	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
>  	vsyscall_gtod_data.wall_to_monotonic = *wtm;
>  	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
> +	vsyscall_gtod_data.cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +	vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
>  
>  	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
>  }
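As promised, a sketch of the measurement loop as I understand the
benchmark description above (20000000 iterations, TSC read before and
after each call). The rdtsc() helper and the absence of serializing
instructions are assumptions on my part; the mail only summarizes the
methodology:

        #include <stdint.h>
        #include <stdio.h>
        #include <time.h>

        /* Plain TSC read; hypothetical helper, not from the patch. */
        static inline uint64_t rdtsc(void)
        {
                uint32_t lo, hi;

                __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
                return ((uint64_t)hi << 32) | lo;
        }

        int main(void)
        {
                enum { ITERATIONS = 20000000 };
                struct timespec ts;
                uint64_t total = 0;
                int i;

                for (i = 0; i < ITERATIONS; i++) {
                        uint64_t before = rdtsc();

                        clock_gettime(CLOCK_MONOTONIC, &ts); /* or CLOCK_TRACE */
                        total += rdtsc() - before;
                }
                printf("average cycles: %llu\n",
                       (unsigned long long)(total / ITERATIONS));
                return 0;
        }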
> 
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index ee55754..81bc9c7 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -22,6 +22,8 @@
>  #include <asm/hpet.h>
>  #include <asm/unistd.h>
>  #include <asm/io.h>
> +#include <asm/trace-clock.h>
> +#include <asm/timer.h>
>  #include "vextern.h"
>  
>  #define gtod vdso_vsyscall_gtod_data
> @@ -69,6 +71,7 @@ vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
>  		--sec;
>  	}
>  	ts->tv_sec = sec;
> +

Extra whitespace to remove on the added line above.

>  	ts->tv_nsec = nsec;
>  }
>  
> @@ -111,6 +114,41 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_X86
> +notrace static noinline int do_trace_clock(struct timespec *ts)
> +{
> +	cycle_t cycles;
> +	unsigned long tmp;
> +
> +	if (!gtod->trace_clock_is_sync)
> +		return -EPERM;
> +
> +	/* Copy of the version in kernel/tsc.c which we cannot directly access
> +	 *
> +	 * Surround the RDTSC by barriers, to make sure it's not
> +	 * speculated to outside the seqlock critical section and
> +	 * does not cause time warps:
> +	 */
> +	rdtsc_barrier();
> +	cycles = (cycle_t)vget_cycles();
> +	rdtsc_barrier();
> +
> +	/* On 64 bits, the nsec field is enough to store the whole information,
> +	 * on 32 bits we need to split it in two fields
> +	 */
> +	tmp = cycles * (USEC_PER_SEC / gtod->cpu_khz) >> CYC2NS_SCALE_FACTOR;

I'd go for creation of:

union aliased_ts {
	struct timespec ts;
	u64 count;
};

and then:

union aliased_ts *ats = ts;
ats->count = tmp;

both for x86 32 and 64. Then you cast the returned struct timespec (in
user-space) into the same aliased structure. Does it seem clean enough?
The goal is to get rid of the code below, especially the division. (A
compilable sketch of this idea follows the vclock_gettime.c diff below.)

> +#ifdef CONFIG_X86_64
> +	ts->tv_nsec = tmp;
> +	ts->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> +	ts->tv_nsec = do_div(tmp, NSEC_PER_SEC);
> +	ts->tv_sec = tmp;
> +#endif
> +
> +	return 0;
> +}
> +#endif /* CONFIG_X86 */
> +
>  notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>  {
>  	if (likely(gtod->sysctl_enabled))
> @@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
>  			return do_realtime_coarse(ts);
>  		case CLOCK_MONOTONIC_COARSE:
>  			return do_monotonic_coarse(ts);

OK, so on kernels not supporting our clock ID, we can detect this
because -EINVAL is returned. Yep, looks good.

> +#ifdef CONFIG_X86
> +		case CLOCK_TRACE:
> +			return do_trace_clock(ts);
> +#endif
> +		default:
> +			return -EINVAL;
>  		}
>  	return vdso_fallback_gettime(clock, ts);
>  }
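To make the aliasing suggestion concrete, here is a compilable user-space
demo of the round trip. Only the union itself comes from the proposal
above; the helper names and the demo are illustrative:

        #include <stdint.h>
        #include <stdio.h>
        #include <time.h>

        /* The union proposed above: the same storage viewed either as a
         * timespec or as a raw 64-bit count, so the fill side never needs
         * a division. */
        union aliased_ts {
                struct timespec ts;
                uint64_t count;
        };

        /* Fill side (what the kernel/vDSO would do): store the raw value. */
        static void trace_clock_fill(struct timespec *ts, uint64_t ns)
        {
                union aliased_ts *ats = (union aliased_ts *)ts;

                ats->count = ns;
        }

        /* Read side (user-space): recover the raw value through the
         * same union. */
        static uint64_t trace_clock_read(const struct timespec *ts)
        {
                const union aliased_ts *ats = (const union aliased_ts *)ts;

                return ats->count;
        }

        int main(void)
        {
                union aliased_ts ats;

                trace_clock_fill(&ats.ts, 123456789ULL);
                printf("raw count: %llu\n",
                       (unsigned long long)trace_clock_read(&ats.ts));
                return 0;
        }

Note that the raw count overlays different timespec fields on 32-bit
(where timespec is 8 bytes) and 64-bit (where it overlays tv_sec), but
since both sides go through the same union this does not matter, and GCC
supports type punning through unions.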
> diff --git a/include/linux/time.h b/include/linux/time.h
> index 9f15ac7..9e6d9e2 100644
> --- a/include/linux/time.h
> +++ b/include/linux/time.h
> @@ -290,6 +290,7 @@ struct itimerval {
>  #define CLOCK_MONOTONIC_RAW	4
>  #define CLOCK_REALTIME_COARSE	5
>  #define CLOCK_MONOTONIC_COARSE	6
> +#define CLOCK_TRACE		32
>  
>  /*
>   * The IDs of various hardware clocks:
> diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
> index 9ca4973..4517f1d 100644
> --- a/kernel/posix-timers.c
> +++ b/kernel/posix-timers.c
> @@ -37,6 +37,8 @@
>  #include <linux/mutex.h>
>  
>  #include <asm/uaccess.h>
> +#include <linux/trace-clock.h>
> +#include <linux/timer.h>
>  #include <linux/list.h>
>  #include <linux/init.h>
>  #include <linux/compiler.h>
> @@ -957,6 +959,31 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
>  {
>  	struct timespec kernel_tp;
>  	int error;
> +#ifdef CONFIG_X86

ifdefs in generic code are usually frowned upon. You probably want to
modify "invalid_clockid" so that it calls a per-architecture check, which
will only succeed on x86. This way, you don't even have to bypass the
currently existing test. (A sketch of such a hook follows the end of the
quoted patch.)

Thanks,

Mathieu

> +	u64 cycles, tmp, scaled_cpu_khz;
> +
> +	if (which_clock == CLOCK_TRACE) {
> +		if (!_trace_clock_is_sync)
> +			return -EPERM;
> +
> +		scaled_cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR;
> +
> +		rdtsc_barrier();
> +		rdtscll(cycles);
> +		rdtsc_barrier();
> +
> +#ifdef CONFIG_X86_64
> +		tmp = cycles * (USEC_PER_SEC / scaled_cpu_khz) >>
> +			CYC2NS_SCALE_FACTOR;
> +		tp->tv_nsec = tmp;
> +		tp->tv_sec = 0;
> +#elif defined CONFIG_X86_32
> +		tmp = cycles * div64_u64(USEC_PER_SEC, scaled_cpu_khz) >>
> +			CYC2NS_SCALE_FACTOR;
> +		tp->tv_sec = div64_u64(tmp, NSEC_PER_SEC);
> +		tp->tv_nsec = tmp - NSEC_PER_SEC * tp->tv_sec;
> +#endif
> +		return 0;
> +	}
> +#endif
> 
>  	if (invalid_clockid(which_clock))
>  		return -EINVAL;
> -- 
> 1.7.0.4
> 
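As mentioned above, here is a sketch of the per-architecture clock-ID
hook. The arch_valid_clockid name and the __weak default are assumptions
of mine, not an existing kernel API, and the generic function body is
only a placeholder for the current logic:

        /* kernel/posix-timers.c: weak default, no extra arch clock IDs. */
        int __weak arch_valid_clockid(const clockid_t which_clock)
        {
                return 0;
        }

        static inline int invalid_clockid(const clockid_t which_clock)
        {
                if (arch_valid_clockid(which_clock))
                        return 0;
                /* ... existing generic checks stay unchanged ... */
                return 1;       /* placeholder for the existing logic */
        }

        /* arch/x86 (file placement illustrative): accept CLOCK_TRACE. */
        int arch_valid_clockid(const clockid_t which_clock)
        {
                return which_clock == CLOCK_TRACE;
        }

With such a hook, the CLOCK_TRACE branch in sys_clock_gettime no longer
needs its own #ifdef CONFIG_X86 in generic code.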
-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

_______________________________________________
ltt-dev mailing list
[email protected]
http://lists.casi.polymtl.ca/cgi-bin/mailman/listinfo/ltt-dev