Hi,

Thanks to everyone who tested v3.

Attached is v4.  I would like to put this into snaps (bcc: deraadt@).

If you've been following along and testing these patches, feel free to
continue testing.  If your results change from v3 to v4, please reply
with what happened and your dmesg.

I made a few small changes from v3:

- After a failure, only keep running the sync test on TSC_DEBUG
  kernels.  For example, it would be a waste of time to run the
  sync test for 62 other CPU pairs if the CPU0/CPU1 sync test had
  already failed.

- Pad the tsc_test_status struct by hand.  Try to keep
  tsc_test_status.val on its own cache line and to prevent one
  instance of the struct from sharing a cache line with another
  instance (the byte math is sketched below).
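
  For reference, the byte math behind the hand padding works out like
  this.  It is the same struct as in the diff below, just annotated,
  and assumes 64-byte cache lines:

        struct tsc_test_status {
                volatile uint64_t val;  /*  8 bytes: hot word, alone on line 0 */
                uint64_t pad1[7];       /* 56 bytes: fill the rest of line 0   */
                uint64_t lag_count;     /*  8 bytes: cold stats, grouped       */
                uint64_t lag_max;       /*  8 bytes: together on line 1        */
                int64_t adj;            /*  8 bytes: also on line 1            */
                uint64_t pad2[5];       /* 40 bytes: fill the rest of line 1   */
        } __aligned(64);                /* 128 bytes total and 64-byte aligned,
                                           so two instances never share a line */

  With the alignment and padding the struct is exactly two cache
  lines, so tsc_ap_status and tsc_bp_status cannot false-share with
  each other or with neighboring data.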

I am looking for OKs.

Assuming the snaps testing doesn't turn up anything catastrophic and
this version is OK'd, I hope to commit it after a couple of weeks in
snaps.

There are two things I'm unsure about that I hope a reviewer will
comment on:

- Do we need to keep the double-test?  IIUC the purpose of the
  double-test is to check for drift.  But with this change we no
  longer have a concept of drift.

- Is the LFENCE in tsc_test_ap()/tsc_test_bp() sufficient
  to ensure one TSC value predates the other?  Or do I need
  to insert membar_consumer()/membar_producer() calls to
  provide that guarantee?  (A sketch of where those calls
  would go is below.)
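
  If the LFENCE alone turns out not to be enough, I imagine the
  membar variant would look roughly like this.  This is only a sketch
  of where the calls would sit in the loops from the diff below
  (shown for the BP-publishes/AP-reads direction; the symmetric pair
  would go in the other direction too), and whether the barriers are
  actually needed or sufficient is exactly the question above:

        /* BP loop body: publish our latest TSC reading. */
        bp_val = rdtsc_lfence();
        tsc_bp_status.val = bp_val;
        membar_producer();      /* order this store before later stores */

        /* AP loop body: read the BP's value, then our own TSC. */
        bp_val = tsc_bp_status.val;
        membar_consumer();      /* complete that load before later loads */
        ap_val = rdtsc_lfence();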

-Scott

Index: sys/arch/amd64/amd64/tsc.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.24
diff -u -p -r1.24 tsc.c
--- sys/arch/amd64/amd64/tsc.c  31 Aug 2021 15:11:54 -0000      1.24
+++ sys/arch/amd64/amd64/tsc.c  20 Jul 2022 21:58:40 -0000
@@ -36,13 +36,6 @@ int          tsc_recalibrate;
 uint64_t       tsc_frequency;
 int            tsc_is_invariant;
 
-#define        TSC_DRIFT_MAX                   250
-#define TSC_SKEW_MAX                   100
-int64_t        tsc_drift_observed;
-
-volatile int64_t       tsc_sync_val;
-volatile struct cpu_info       *tsc_sync_cpu;
-
 u_int          tsc_get_timecount(struct timecounter *tc);
 void           tsc_delay(int usecs);
 
@@ -236,22 +229,12 @@ cpu_recalibrate_tsc(struct timecounter *
 u_int
 tsc_get_timecount(struct timecounter *tc)
 {
-       return rdtsc_lfence() + curcpu()->ci_tsc_skew;
+       return rdtsc_lfence();
 }
 
 void
 tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
 {
-#ifdef TSC_DEBUG
-       printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname,
-           (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed);
-#endif
-       if (ci->ci_tsc_skew < -TSC_SKEW_MAX || ci->ci_tsc_skew > TSC_SKEW_MAX) {
-               printf("%s: disabling user TSC (skew=%lld)\n",
-                   ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew);
-               tsc_timecounter.tc_user = 0;
-       }
-
        if (!(ci->ci_flags & CPUF_PRIMARY) ||
            !(ci->ci_flags & CPUF_CONST_TSC) ||
            !(ci->ci_flags & CPUF_INVAR_TSC))
@@ -268,111 +251,276 @@ tsc_timecounter_init(struct cpu_info *ci
                calibrate_tsc_freq();
        }
 
-       if (tsc_drift_observed > TSC_DRIFT_MAX) {
-               printf("ERROR: %lld cycle TSC drift observed\n",
-                   (long long)tsc_drift_observed);
-               tsc_timecounter.tc_quality = -1000;
-               tsc_timecounter.tc_user = 0;
-               tsc_is_invariant = 0;
-       }
-
        tc_init(&tsc_timecounter);
 }
 
-/*
- * Record drift (in clock cycles).  Called during AP startup.
- */
 void
-tsc_sync_drift(int64_t drift)
+tsc_delay(int usecs)
 {
-       if (drift < 0)
-               drift = -drift;
-       if (drift > tsc_drift_observed)
-               tsc_drift_observed = drift;
+       uint64_t interval, start;
+
+       interval = (uint64_t)usecs * tsc_frequency / 1000000;
+       start = rdtsc_lfence();
+       while (rdtsc_lfence() - start < interval)
+               CPU_BUSY_CYCLE();
 }
 
+#ifdef MULTIPROCESSOR
+
+#define TSC_DEBUG 1
+
+/*
+ * Protections for global variables in this code:
+ *
+ *     a       Modified atomically
+ *     b       Protected by a barrier
+ *     p       Only modified by the primary CPU
+ */
+
+#define TSC_TEST_MS            1       /* Test round duration */
+#define TSC_TEST_ROUNDS                2       /* Number of test rounds */
+
 /*
- * Called during startup of APs, by the boot processor.  Interrupts
- * are disabled on entry.
+ * tsc_test_status.val is cacheline-aligned (64-byte) to limit
+ * false sharing during the test and reduce our margin of error.
  */
+struct tsc_test_status {
+       volatile uint64_t val;          /* [b] latest RDTSC value */
+       uint64_t pad1[7];
+       uint64_t lag_count;             /* [b] number of lags seen by CPU */
+       uint64_t lag_max;               /* [b] Biggest lag seen */
+       int64_t adj;                    /* [b] initial IA32_TSC_ADJUST value */
+       uint64_t pad2[5];
+} __aligned(64);
+struct tsc_test_status tsc_ap_status;  /* [b] Test results from AP */
+struct tsc_test_status tsc_bp_status;  /* [p] Test results from BP */
+uint64_t tsc_test_cycles;              /* [p] TSC cycles per test round */
+const char *tsc_ap_name;               /* [b] Name of AP */
+volatile u_int tsc_egress_barrier;     /* [a] Test end barrier */
+volatile u_int tsc_ingress_barrier;    /* [a] Test start barrier */
+volatile u_int tsc_test_rounds;                /* [p] Remaining test rounds */
+int tsc_is_synchronized = 1;           /* [p] TSC sync'd across all CPUs? */
+
+void tsc_report_test_results(void);
+void tsc_reset_adjust(struct tsc_test_status *);
+void tsc_test_ap(void);
+void tsc_test_bp(void);
+
 void
-tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
+tsc_test_sync_bp(struct cpu_info *ci)
 {
-       uint64_t bptsc;
-
-       if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL)
-               panic("tsc_sync_bp: 1");
+       /* TSC must be constant and invariant to use it as a timecounter. */
+       if (!tsc_is_invariant)
+               return;
+#ifndef TSC_DEBUG
+       /* No point in testing again if we already failed. */
+       if (!tsc_is_synchronized)
+               return;
+#endif
+       /* Reset IA32_TSC_ADJUST if it exists. */
+       tsc_reset_adjust(&tsc_bp_status);
 
-       /* Flag it and read our TSC. */
-       atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-       bptsc = (rdtsc_lfence() >> 1);
+       /* Reset the test cycle limit and round count. */
+       tsc_test_cycles = TSC_TEST_MS * tsc_frequency / 1000;
+       tsc_test_rounds = TSC_TEST_ROUNDS;
+
+       do {
+               /*
+                * Pass through the ingress barrier, run the test,
+                * then wait for the AP to reach the egress barrier.
+                */
+               atomic_inc_int(&tsc_ingress_barrier);
+               while (tsc_ingress_barrier != 2)
+                       CPU_BUSY_CYCLE();
+               tsc_test_bp();
+               while (tsc_egress_barrier != 1)
+                       CPU_BUSY_CYCLE();
+
+               /*
+                * Report what happened.  Adjust the timecounter quality
+                * if we failed the test.
+                *
+                * XXX We need a tc_detach() function to cleanly
+                * disable the timecounter.  Lowering the quality
+                * like this is a nasty hack.
+                */
+               tsc_report_test_results();
+               if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
+                       tsc_timecounter.tc_quality = -1000;
+                       tsc_is_synchronized = 0;
+                       tsc_test_rounds = 0;
+               } else
+                       tsc_test_rounds--;
+
+               /*
+                * Clean up for the next round.
+                *
+                * It is safe to reset the ingress barrier because
+                * at this point we know the AP has reached the egress
+                * barrier.
+                */
+               memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
+               memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
+               tsc_ingress_barrier = 0;
+               if (tsc_test_rounds == 0)
+                       tsc_ap_name = NULL;
+
+               /*
+                * Pass through the egress barrier and release the AP.
+                * The AP is responsible for resetting the egress barrier.
+                */
+               if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
+                       panic("%s: unexpected egress count", __func__);
+       } while (tsc_test_rounds > 0);
+}
 
-       /* Wait for remote to complete, and read ours again. */
-       while ((ci->ci_flags & CPUF_SYNCTSC) != 0)
-               membar_consumer();
-       bptsc += (rdtsc_lfence() >> 1);
+void
+tsc_test_sync_ap(struct cpu_info *ci)
+{
+       if (!tsc_is_invariant)
+               return;
+#ifndef TSC_DEBUG
+       if (!tsc_is_synchronized)
+               return;
+#endif
+       /* The BP needs our name in order to report any problems. */
+       tsc_ap_name = ci->ci_dev->dv_xname;
 
-       /* Wait for the results to come in. */
-       while (tsc_sync_cpu == ci)
-               CPU_BUSY_CYCLE();
-       if (tsc_sync_cpu != NULL)
-               panic("tsc_sync_bp: 2");
+       tsc_reset_adjust(&tsc_ap_status);
 
-       *bptscp = bptsc;
-       *aptscp = tsc_sync_val;
+       /*
+        * The AP is only responsible for running the test and
+        * resetting the egress barrier.  The BP handles everything
+        * else.
+        */
+       do {
+               atomic_inc_int(&tsc_ingress_barrier);
+               while (tsc_ingress_barrier != 2)
+                       CPU_BUSY_CYCLE();
+               tsc_test_ap();
+               atomic_inc_int(&tsc_egress_barrier);
+               while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
+                       CPU_BUSY_CYCLE();
+       } while (tsc_test_rounds > 0);
 }
 
 void
-tsc_sync_bp(struct cpu_info *ci)
+tsc_report_test_results(void)
 {
-       uint64_t bptsc, aptsc;
-
-       tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */
-       tsc_read_bp(ci, &bptsc, &aptsc);
+       u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;
 
-       /* Compute final value to adjust for skew. */
-       ci->ci_tsc_skew = bptsc - aptsc;
+       if (tsc_bp_status.adj != 0) {
+               printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n",
+                   tsc_bp_status.adj);
+       }
+       if (tsc_ap_status.adj != 0) {
+               printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n",
+                   tsc_ap_name, tsc_ap_status.adj);
+       }
+       if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) {
+               printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
+                   tsc_ap_name, round, TSC_TEST_ROUNDS);
+       }
+       if (tsc_bp_status.lag_count > 0) {
+               printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
+                   tsc_ap_name, tsc_bp_status.lag_count,
+                   tsc_bp_status.lag_max);
+       }
+       if (tsc_ap_status.lag_count > 0) {
+               printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
+                   tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
+                   tsc_ap_status.lag_max);
+       }
 }
 
 /*
- * Called during startup of AP, by the AP itself.  Interrupts are
- * disabled on entry.
+ * Reset IA32_TSC_ADJUST if we have it.
+ *
+ * XXX We should rearrange cpu_hatch() so that the feature flags
+ * are already set before we get here.  Do CPUID() and CPUID_LEAF()
+ * here as a workaround until then.
  */
 void
-tsc_post_ap(struct cpu_info *ci)
+tsc_reset_adjust(struct tsc_test_status *tts)
 {
-       uint64_t tsc;
-
-       /* Wait for go-ahead from primary. */
-       while ((ci->ci_flags & CPUF_SYNCTSC) == 0)
-               membar_consumer();
-       tsc = (rdtsc_lfence() >> 1);
-
-       /* Instruct primary to read its counter. */
-       atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-       tsc += (rdtsc_lfence() >> 1);
-
-       /* Post result.  Ensure the whole value goes out atomically. */
-       (void)atomic_swap_64(&tsc_sync_val, tsc);
+       uint32_t eax, ebx, ecx, edx;
 
-       if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci)
-               panic("tsc_sync_ap");
+       CPUID(0, eax, ebx, ecx, edx);
+       if (eax >= 7) {
+               CPUID_LEAF(7, 0, eax, ebx, ecx, edx);
+               if (ISSET(ebx, SEFF0EBX_TSC_ADJUST)) {
+                       tts->adj = rdmsr(MSR_TSC_ADJUST);
+                       if (tts->adj != 0)
+                               wrmsr(MSR_TSC_ADJUST, 0);
+               }
+       }
 }
 
 void
-tsc_sync_ap(struct cpu_info *ci)
+tsc_test_ap(void)
 {
-       tsc_post_ap(ci);
-       tsc_post_ap(ci);
+       uint64_t ap_val, bp_val, end, lag;
+       u_int i = 0;
+
+       ap_val = rdtsc_lfence();
+       end = ap_val + tsc_test_cycles;
+       while (ap_val < end) {
+               /*
+                * The LFENCE ensures bp_val predates ap_val.  If ap_val
+                * is smaller than bp_val then the AP's TSC must lag that
+                * of the BP and the counters cannot be synchronized.
+                */
+               bp_val = tsc_bp_status.val;
+               ap_val = rdtsc_lfence();
+               tsc_ap_status.val = ap_val;
+
+               /*
+                * Ensure the other CPU has a chance to run.  This is
+                * crucial if the other CPU is our SMT sibling.  SMT
+                * starvation can prevent this test from detecting
+                * relatively large lags.  Eight is an arbitrary value,
+                * but it seems to work in practice without impacting
+                * the sensitivity of the test for non-sibling threads.
+                */
+               if (++i % 8 == 0)
+                       CPU_BUSY_CYCLE();
+
+               /*
+                * Record the magnitude of the problem if our TSC lags
+                * the other.
+                */
+               if (__predict_false(ap_val < bp_val)) {
+                       tsc_ap_status.lag_count++;
+                       lag = bp_val - ap_val;
+                       if (tsc_ap_status.lag_max < lag)
+                               tsc_ap_status.lag_max = lag;
+               }
+       }
 }
 
 void
-tsc_delay(int usecs)
+tsc_test_bp(void)
 {
-       uint64_t interval, start;
+       uint64_t ap_val, bp_val, end, lag;
+       u_int i = 0;
 
-       interval = (uint64_t)usecs * tsc_frequency / 1000000;
-       start = rdtsc_lfence();
-       while (rdtsc_lfence() - start < interval)
-               CPU_BUSY_CYCLE();
+       bp_val = rdtsc_lfence();
+       end = bp_val + tsc_test_cycles;
+       while (bp_val < end) {
+               ap_val = tsc_ap_status.val;
+               bp_val = rdtsc_lfence();
+               tsc_bp_status.val = bp_val;
+
+               if (++i % 8 == 0)
+                       CPU_BUSY_CYCLE();
+
+               if (__predict_false(bp_val < ap_val)) {
+                       tsc_bp_status.lag_count++;
+                       lag = ap_val - bp_val;
+                       if (tsc_bp_status.lag_max < lag)
+                               tsc_bp_status.lag_max = lag;
+               }
+       }
 }
+
+#endif /* MULTIPROCESSOR */
Index: sys/arch/amd64/amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.156
diff -u -p -r1.156 cpu.c
--- sys/arch/amd64/amd64/cpu.c  26 Apr 2022 08:35:30 -0000      1.156
+++ sys/arch/amd64/amd64/cpu.c  20 Jul 2022 21:58:40 -0000
@@ -772,9 +772,9 @@ cpu_init(struct cpu_info *ci)
        lcr4(cr4 & ~CR4_PGE);
        lcr4(cr4);
 
-       /* Synchronize TSC */
+       /* Check if TSC is synchronized. */
        if (cold && !CPU_IS_PRIMARY(ci))
-             tsc_sync_ap(ci);
+             tsc_test_sync_ap(ci);
 #endif
 }
 
@@ -854,18 +854,14 @@ cpu_start_secondary(struct cpu_info *ci)
 #endif
        } else {
                /*
-                * Synchronize time stamp counters. Invalidate cache and
-                * synchronize twice (in tsc_sync_bp) to minimize possible
-                * cache effects. Disable interrupts to try and rule out any
-                * external interference.
+                * Test if TSCs are synchronized.  Invalidate cache to
+                * minimize possible cache effects.  Disable interrupts to
+                * try to rule out external interference.
                 */
                s = intr_disable();
                wbinvd();
-               tsc_sync_bp(ci);
+               tsc_test_sync_bp(ci);
                intr_restore(s);
-#ifdef TSC_DEBUG
-               printf("TSC skew=%lld\n", (long long)ci->ci_tsc_skew);
-#endif
        }
 
        if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
@@ -890,7 +886,6 @@ void
 cpu_boot_secondary(struct cpu_info *ci)
 {
        int i;
-       int64_t drift;
        u_long s;
 
        atomic_setbits_int(&ci->ci_flags, CPUF_GO);
@@ -905,18 +900,11 @@ cpu_boot_secondary(struct cpu_info *ci)
                db_enter();
 #endif
        } else if (cold) {
-               /* Synchronize TSC again, check for drift. */
-               drift = ci->ci_tsc_skew;
+               /* Test if TSCs are synchronized again. */
                s = intr_disable();
                wbinvd();
-               tsc_sync_bp(ci);
+               tsc_test_sync_bp(ci);
                intr_restore(s);
-               drift -= ci->ci_tsc_skew;
-#ifdef TSC_DEBUG
-               printf("TSC skew=%lld drift=%lld\n",
-                   (long long)ci->ci_tsc_skew, (long long)drift);
-#endif
-               tsc_sync_drift(drift);
        }
 }
 
@@ -942,13 +930,12 @@ cpu_hatch(void *v)
 #endif
 
        /*
-        * Synchronize the TSC for the first time. Note that interrupts are
-        * off at this point.
+        * Test if our TSC is synchronized for the first time.
+        * Note that interrupts are off at this point.
         */
        wbinvd();
        ci->ci_flags |= CPUF_PRESENT;
-       ci->ci_tsc_skew = 0;    /* reset on resume */
-       tsc_sync_ap(ci);
+       tsc_test_sync_ap(ci);
 
        lapic_enable();
        lapic_startclock();
Index: sys/arch/amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.145
diff -u -p -r1.145 cpu.h
--- sys/arch/amd64/include/cpu.h        12 Jul 2022 04:46:00 -0000      1.145
+++ sys/arch/amd64/include/cpu.h        20 Jul 2022 21:58:40 -0000
@@ -207,8 +207,6 @@ struct cpu_info {
        paddr_t         ci_vmxon_region_pa;
        struct vmxon_region *ci_vmxon_region;
 
-       int64_t         ci_tsc_skew;            /* counter skew vs cpu0 */
-
        char            ci_panicbuf[512];
 
        paddr_t         ci_vmcs_pa;
@@ -228,7 +226,6 @@ struct cpu_info {
 #define CPUF_INVAR_TSC 0x0100          /* CPU has invariant TSC */
 #define CPUF_USERXSTATE        0x0200          /* CPU has curproc's xsave state */
 
-#define CPUF_SYNCTSC   0x0800          /* Synchronize TSC */
 #define CPUF_PRESENT   0x1000          /* CPU is present */
 #define CPUF_RUNNING   0x2000          /* CPU is running */
 #define CPUF_PAUSE     0x4000          /* CPU is paused in DDB */
Index: sys/arch/amd64/include/cpuvar.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 cpuvar.h
--- sys/arch/amd64/include/cpuvar.h     16 May 2021 04:33:05 -0000      1.11
+++ sys/arch/amd64/include/cpuvar.h     20 Jul 2022 21:58:40 -0000
@@ -97,8 +97,7 @@ void identifycpu(struct cpu_info *);
 void cpu_init(struct cpu_info *);
 void cpu_init_first(void);
 
-void tsc_sync_drift(int64_t);
-void tsc_sync_bp(struct cpu_info *);
-void tsc_sync_ap(struct cpu_info *);
+void tsc_test_sync_bp(struct cpu_info *);
+void tsc_test_sync_ap(struct cpu_info *);
 
 #endif
