RDTSC is not a serializing instruction; to make sure we get the TSC
value corresponding to the position of RDTSC in the instruction stream
we need a barrier.  Linux uses LFENCE on machines where it is
available.  FreeBSD seems to prefer MFENCE for AMD CPUs but uses
LFENCE for Intel CPUs.  For now my thinking is that what's good enough
for Linux should be good enough for us.  And on amd64 LFENCE is always
available.
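
(For illustration only: a hypothetical FreeBSD-style MFENCE variant,
not part of this diff, would look something like this:)

static __inline u_int64_t
rdtsc_mfence(void)
{
	uint32_t hi, lo;

	/* MFENCE orders both loads and stores before RDTSC executes;
	 * heavier than LFENCE, but the fence FreeBSD prefers on AMD. */
	__asm volatile("mfence; rdtsc" : "=d" (hi), "=a" (lo));
	return (((uint64_t)hi << 32) | (uint64_t) lo);
}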

This diff reduces the scatter in the skew values.  Before I had
occasional outliers of more than 200 cycles.  Now the maximum values
I see are around 60 cycles.

I didn't change the rdtsc() call that reads the timecounter.  But
maybe that one should change as well?  Bit of a tradeoff between
performance and accuracy, I think.
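
If we did change it, the timecounter read would look roughly like this
(sketch only, not part of this diff; costs a few extra cycles per call):

u_int
tsc_get_timecount(struct timecounter *tc)
{
	/* Fenced read instead of the plain rdtsc() used today. */
	return rdtsc_lfence() + curcpu()->ci_tsc_skew;
}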

This also changes the skew print message (stolen from what Theo put in
snaps).  Printing the CPU number makes it easier to get statistics for
a specific CPU.  The diff also enables the debug message.  Maybe it
should be committed this way and then disabled again later so that we
can get some statistics?

comments?  ok?


Index: arch/amd64/amd64/tsc.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.16
diff -u -p -r1.16 tsc.c
--- arch/amd64/amd64/tsc.c      6 Apr 2020 00:01:08 -0000       1.16
+++ arch/amd64/amd64/tsc.c      20 Jun 2020 20:01:46 -0000
@@ -100,9 +100,9 @@ get_tsc_and_timecount(struct timecounter
        int i;
 
        for (i = 0; i < RECALIBRATE_MAX_RETRIES; i++) {
-               tsc1 = rdtsc();
+               tsc1 = rdtsc_lfence();
                n = (tc->tc_get_timecount(tc) & tc->tc_counter_mask);
-               tsc2 = rdtsc();
+               tsc2 = rdtsc_lfence();
 
                if ((tsc2 - tsc1) < RECALIBRATE_SMI_THRESHOLD) {
                        *count = n;
@@ -216,8 +216,9 @@ tsc_get_timecount(struct timecounter *tc
 void
 tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
 {
+#define TSC_DEBUG
 #ifdef TSC_DEBUG
-       printf("%s: TSC skew=%lld observed drift=%lld\n", __func__,
+       printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname,
            (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed);
 #endif
 
@@ -276,12 +277,12 @@ tsc_read_bp(struct cpu_info *ci, uint64_
 
        /* Flag it and read our TSC. */
        atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-       bptsc = (rdtsc() >> 1);
+       bptsc = (rdtsc_lfence() >> 1);
 
        /* Wait for remote to complete, and read ours again. */
        while ((ci->ci_flags & CPUF_SYNCTSC) != 0)
                membar_consumer();
-       bptsc += (rdtsc() >> 1);
+       bptsc += (rdtsc_lfence() >> 1);
 
        /* Wait for the results to come in. */
        while (tsc_sync_cpu == ci)
@@ -317,11 +318,11 @@ tsc_post_ap(struct cpu_info *ci)
        /* Wait for go-ahead from primary. */
        while ((ci->ci_flags & CPUF_SYNCTSC) == 0)
                membar_consumer();
-       tsc = (rdtsc() >> 1);
+       tsc = (rdtsc_lfence() >> 1);
 
        /* Instruct primary to read its counter. */
        atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-       tsc += (rdtsc() >> 1);
+       tsc += (rdtsc_lfence() >> 1);
 
        /* Post result.  Ensure the whole value goes out atomically. */
        (void)atomic_swap_64(&tsc_sync_val, tsc);
Index: arch/amd64/include/cpufunc.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v
retrieving revision 1.34
diff -u -p -r1.34 cpufunc.h
--- arch/amd64/include/cpufunc.h        28 Jun 2019 21:54:05 -0000      1.34
+++ arch/amd64/include/cpufunc.h        20 Jun 2020 20:01:46 -0000
@@ -292,6 +292,15 @@ rdtsc(void)
 }
 
 static __inline u_int64_t
+rdtsc_lfence(void)
+{
+       uint32_t hi, lo;
+
+       __asm volatile("lfence; rdtsc" : "=d" (hi), "=a" (lo));
+       return (((uint64_t)hi << 32) | (uint64_t) lo);
+}
+
+static __inline u_int64_t
 rdpmc(u_int pmc)
 {
        uint32_t hi, lo;
