This new option to clock_gettime allows the user to retreive a precise timestamp using the LTTng infrastructure. If the TSC is not synchronized across the CPUs (validated at boot time) the function returns an error to the user in order to make him understand he won't be able to synchronize automatically kernel and userspace traces and he need to fallback on an other clocksource.
The main difference with using the TSC clocksource directly is that the time starts at machine boot and not at Linux boot which make it possible to correlate user and kernelspace events. On 64 bits architecture we only use the nsec field of the timespec to encode the time in nanoseconds, the sec field is always 0. On 32 bits we have to make the division and split the sec and nsec. I am not particularly proud of the way the syscall is implemented (in kernel/posix-timers.c), I would be really glad if anyone has an idea to clean that up. Some benchmarks (with 20000000 iterations reading the tsc before and after each call on an i7 920): 64 bits with vDSO average cycles for clock_realtime: 101 average cycles for clock_monotonic: 104 average cycles for clock_trace: 66 64 bits without vDSO (using syscall) average cycles for clock_realtime: 240 average cycles for clock_monotonic: 256 average cycles for clock_trace: 231 32 bits with AND without vDSO average cycles for clock_realtime: 649 average cycles for clock_monotonic: 661 average cycles for clock_trace: 630 Signed-off-by: Julien Desfossez <[email protected]> --- arch/x86/include/asm/trace-clock.h | 1 + arch/x86/include/asm/vgtod.h | 2 + arch/x86/kernel/vsyscall_64.c | 5 ++++ arch/x86/vdso/vclock_gettime.c | 44 ++++++++++++++++++++++++++++++++++++ include/linux/time.h | 1 + kernel/posix-timers.c | 27 ++++++++++++++++++++++ 6 files changed, 80 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h index 01bc2f5..c1fd160 100644 --- a/arch/x86/include/asm/trace-clock.h +++ b/arch/x86/include/asm/trace-clock.h @@ -14,6 +14,7 @@ #include <asm/system.h> #include <asm/processor.h> #include <asm/atomic.h> +#include <asm/vgtod.h> /* Minimum duration of a probe, in cycles */ #define TRACE_CLOCK_MIN_PROBE_DURATION 200 diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e20..7367f1c 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -12,6 +12,8 @@ struct vsyscall_gtod_data { u32 wall_time_nsec; int sysctl_enabled; + int trace_clock_is_sync; + unsigned long cpu_khz; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c..ce750b5 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,6 +44,8 @@ #include <asm/desc.h> #include <asm/topology.h> #include <asm/vgtod.h> +#include <asm/trace-clock.h> +#include <asm/timer.h> #define __vsyscall(nr) \ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, + .trace_clock_is_sync = 0, }; void update_vsyscall_tz(void) @@ -89,6 +92,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR; + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index ee55754..81bc9c7 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -22,6 +22,8 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> +#include <asm/trace-clock.h> +#include <asm/timer.h> #include "vextern.h" #define gtod vdso_vsyscall_gtod_data @@ -69,6 +71,7 @@ vset_normalized_timespec(struct timespec *ts, long sec, long nsec) --sec; } ts->tv_sec = sec; + ts->tv_nsec = nsec; } @@ -111,6 +114,41 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) return 0; } +#ifdef CONFIG_X86 +notrace static noinline int do_trace_clock(struct timespec *ts) +{ + cycle_t cycles; + unsigned long tmp; + + if (!gtod->trace_clock_is_sync) + return -EPERM; + + /* Copy of the version in kernel/tsc.c which we cannot directly access + * + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + cycles = (cycle_t)vget_cycles(); + rdtsc_barrier(); + + /* On 64 bits, the nsec field is enough to store the whole information, + * on 32 bits we need to split it in two fields + */ + tmp = cycles * (USEC_PER_SEC / gtod->cpu_khz) >> CYC2NS_SCALE_FACTOR; +#ifdef CONFIG_X86_64 + ts->tv_nsec = tmp; + ts->tv_sec = 0; +#elif defined CONFIG_X86_32 + ts->tv_nsec = do_div(tmp, NSEC_PER_SEC); + ts->tv_sec = tmp; +#endif + + return 0; +} +#endif /* CONFIG_X86 */ + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled)) @@ -127,6 +165,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) return do_realtime_coarse(ts); case CLOCK_MONOTONIC_COARSE: return do_monotonic_coarse(ts); +#ifdef CONFIG_X86 + case CLOCK_TRACE: + return do_trace_clock(ts); +#endif + default: + return -EINVAL; } return vdso_fallback_gettime(clock, ts); } diff --git a/include/linux/time.h b/include/linux/time.h index 9f15ac7..9e6d9e2 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -290,6 +290,7 @@ struct itimerval { #define CLOCK_MONOTONIC_RAW 4 #define CLOCK_REALTIME_COARSE 5 #define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_TRACE 32 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973..4517f1d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -37,6 +37,8 @@ #include <linux/mutex.h> #include <asm/uaccess.h> +#include <linux/trace-clock.h> +#include <linux/timer.h> #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> @@ -957,6 +959,31 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, { struct timespec kernel_tp; int error; +#ifdef CONFIG_X86 + u64 cycles, tmp, scaled_cpu_khz; + + if (which_clock == CLOCK_TRACE) { + if (!_trace_clock_is_sync) + return -EPERM; + + scaled_cpu_khz = cpu_khz >> CYC2NS_SCALE_FACTOR; + + rdtsc_barrier(); + rdtscll(cycles); + rdtsc_barrier(); + +#ifdef CONFIG_X86_64 + tmp = cycles*(USEC_PER_SEC/scaled_cpu_khz) >> CYC2NS_SCALE_FACTOR; + tp->tv_nsec = tmp; + tp->tv_sec = 0; +#elif defined CONFIG_X86_32 + tmp = cycles*(div64_u64(USEC_PER_SEC, scaled_cpu_khz)) >> CYC2NS_SCALE_FACTOR; + tp->tv_sec = div64_u64(tmp, NSEC_PER_SEC); + tp->tv_nsec = tmp - NSEC_PER_SEC * tp->tv_sec; +#endif + return 0; + } +#endif if (invalid_clockid(which_clock)) return -EINVAL; -- 1.7.0.4 _______________________________________________ ltt-dev mailing list [email protected] http://lists.casi.polymtl.ca/cgi-bin/mailman/listinfo/ltt-dev
