Hi, Thanks to everyone who tested v3.
Attached is v4. I would like to put this into snaps (bcc: deraadt@). If you've been following along and testing these patches, feel free to continue testing. If your results change from v3 to v4, please reply with what happened and your dmesg. I made a few small changes from v3: - Only keep running the sync test after a failure on TSC_DEBUG kernels. For example, it would be a waste of time to run the sync test for 62 other CPU pairs if the CPU0/CPU1 sync test failed. - Pad the tsc_test_status struct by hand. Try to keep tsc_test_status.val on its own cache line and try to prevent one instance of the struct from sharing a cache line with another instance. I am looking for OKs. Assuming the results from snaps testing aren't catastrophic, and this version is OK'd, I hope to commit this after a couple weeks in snaps. There are two things I'm unsure about that I hope a reviewer will comment on: - Do we need to keep the double-test? IIUC the purpose of the double-test is to check for drift. But with this change we no longer have a concept of drift. - Is the LFENCE in tsc_test_ap()/tsc_test_bp() sufficient to ensure one TSC value predates the other? Or do I need to insert membar_consumer()/membar_producer() calls to provide that guarantee? 
-Scott Index: sys/arch/amd64/amd64/tsc.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v retrieving revision 1.24 diff -u -p -r1.24 tsc.c --- sys/arch/amd64/amd64/tsc.c 31 Aug 2021 15:11:54 -0000 1.24 +++ sys/arch/amd64/amd64/tsc.c 20 Jul 2022 21:58:40 -0000 @@ -36,13 +36,6 @@ int tsc_recalibrate; uint64_t tsc_frequency; int tsc_is_invariant; -#define TSC_DRIFT_MAX 250 -#define TSC_SKEW_MAX 100 -int64_t tsc_drift_observed; - -volatile int64_t tsc_sync_val; -volatile struct cpu_info *tsc_sync_cpu; - u_int tsc_get_timecount(struct timecounter *tc); void tsc_delay(int usecs); @@ -236,22 +229,12 @@ cpu_recalibrate_tsc(struct timecounter * u_int tsc_get_timecount(struct timecounter *tc) { - return rdtsc_lfence() + curcpu()->ci_tsc_skew; + return rdtsc_lfence(); } void tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq) { -#ifdef TSC_DEBUG - printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname, - (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed); -#endif - if (ci->ci_tsc_skew < -TSC_SKEW_MAX || ci->ci_tsc_skew > TSC_SKEW_MAX) { - printf("%s: disabling user TSC (skew=%lld)\n", - ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew); - tsc_timecounter.tc_user = 0; - } - if (!(ci->ci_flags & CPUF_PRIMARY) || !(ci->ci_flags & CPUF_CONST_TSC) || !(ci->ci_flags & CPUF_INVAR_TSC)) @@ -268,111 +251,276 @@ tsc_timecounter_init(struct cpu_info *ci calibrate_tsc_freq(); } - if (tsc_drift_observed > TSC_DRIFT_MAX) { - printf("ERROR: %lld cycle TSC drift observed\n", - (long long)tsc_drift_observed); - tsc_timecounter.tc_quality = -1000; - tsc_timecounter.tc_user = 0; - tsc_is_invariant = 0; - } - tc_init(&tsc_timecounter); } -/* - * Record drift (in clock cycles). Called during AP startup. 
- */ void -tsc_sync_drift(int64_t drift) +tsc_delay(int usecs) { - if (drift < 0) - drift = -drift; - if (drift > tsc_drift_observed) - tsc_drift_observed = drift; + uint64_t interval, start; + + interval = (uint64_t)usecs * tsc_frequency / 1000000; + start = rdtsc_lfence(); + while (rdtsc_lfence() - start < interval) + CPU_BUSY_CYCLE(); } +#ifdef MULTIPROCESSOR + +#define TSC_DEBUG 1 + +/* + * Protections for global variables in this code: + * + * a Modified atomically + * b Protected by a barrier + * p Only modified by the primary CPU + */ + +#define TSC_TEST_MS 1 /* Test round duration */ +#define TSC_TEST_ROUNDS 2 /* Number of test rounds */ + /* - * Called during startup of APs, by the boot processor. Interrupts - * are disabled on entry. + * tsc_test_status.val is cacheline-aligned (64-byte) to limit + * false sharing during the test and reduce our margin of error. */ +struct tsc_test_status { + volatile uint64_t val; /* [b] latest RDTSC value */ + uint64_t pad1[7]; + uint64_t lag_count; /* [b] number of lags seen by CPU */ + uint64_t lag_max; /* [b] Biggest lag seen */ + int64_t adj; /* [b] initial IA32_TSC_ADJUST value */ + uint64_t pad2[5]; +} __aligned(64); +struct tsc_test_status tsc_ap_status; /* [b] Test results from AP */ +struct tsc_test_status tsc_bp_status; /* [p] Test results from BP */ +uint64_t tsc_test_cycles; /* [p] TSC cycles per test round */ +const char *tsc_ap_name; /* [b] Name of AP */ +volatile u_int tsc_egress_barrier; /* [a] Test end barrier */ +volatile u_int tsc_ingress_barrier; /* [a] Test start barrier */ +volatile u_int tsc_test_rounds; /* [p] Remaining test rounds */ +int tsc_is_synchronized = 1; /* [p] TSC sync'd across all CPUs? 
*/ + +void tsc_report_test_results(void); +void tsc_reset_adjust(struct tsc_test_status *); +void tsc_test_ap(void); +void tsc_test_bp(void); + void -tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp) +tsc_test_sync_bp(struct cpu_info *ci) { - uint64_t bptsc; - - if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) - panic("tsc_sync_bp: 1"); + /* TSC must be constant and invariant to use it as a timecounter. */ + if (!tsc_is_invariant) + return; +#ifndef TSC_DEBUG + /* No point in testing again if we already failed. */ + if (!tsc_is_synchronized) + return; +#endif + /* Reset IA32_TSC_ADJUST if it exists. */ + tsc_reset_adjust(&tsc_bp_status); - /* Flag it and read our TSC. */ - atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC); - bptsc = (rdtsc_lfence() >> 1); + /* Reset the test cycle limit and round count. */ + tsc_test_cycles = TSC_TEST_MS * tsc_frequency / 1000; + tsc_test_rounds = TSC_TEST_ROUNDS; + + do { + /* + * Pass through the ingress barrier, run the test, + * then wait for the AP to reach the egress barrier. + */ + atomic_inc_int(&tsc_ingress_barrier); + while (tsc_ingress_barrier != 2) + CPU_BUSY_CYCLE(); + tsc_test_bp(); + while (tsc_egress_barrier != 1) + CPU_BUSY_CYCLE(); + + /* + * Report what happened. Adjust the timecounter quality + * if we failed the test. + * + * XXX We need a tc_detach() function to cleanly + * disable the timecounter. Lowering the quality + * like this is a nasty hack. + */ + tsc_report_test_results(); + if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) { + tsc_timecounter.tc_quality = -1000; + tsc_is_synchronized = 0; + tsc_test_rounds = 0; + } else + tsc_test_rounds--; + + /* + * Clean up for the next round. + * + * It is safe to reset the ingress barrier because + * at this point we know the AP has reached the egress + * barrier. 
+ */ + memset(&tsc_ap_status, 0, sizeof tsc_ap_status); + memset(&tsc_bp_status, 0, sizeof tsc_bp_status); + tsc_ingress_barrier = 0; + if (tsc_test_rounds == 0) + tsc_ap_name = NULL; + + /* + * Pass through the egress barrier and release the AP. + * The AP is responsible for resetting the egress barrier. + */ + if (atomic_inc_int_nv(&tsc_egress_barrier) != 2) + panic("%s: unexpected egress count", __func__); + } while (tsc_test_rounds > 0); +} - /* Wait for remote to complete, and read ours again. */ - while ((ci->ci_flags & CPUF_SYNCTSC) != 0) - membar_consumer(); - bptsc += (rdtsc_lfence() >> 1); +void +tsc_test_sync_ap(struct cpu_info *ci) +{ + if (!tsc_is_invariant) + return; +#ifndef TSC_DEBUG + if (!tsc_is_synchronized) + return; +#endif + /* The BP needs our name in order to report any problems. */ + tsc_ap_name = ci->ci_dev->dv_xname; - /* Wait for the results to come in. */ - while (tsc_sync_cpu == ci) - CPU_BUSY_CYCLE(); - if (tsc_sync_cpu != NULL) - panic("tsc_sync_bp: 2"); + tsc_reset_adjust(&tsc_ap_status); - *bptscp = bptsc; - *aptscp = tsc_sync_val; + /* + * The AP is only responsible for running the test and + * resetting the egress barrier. The BP handles everything + * else. + */ + do { + atomic_inc_int(&tsc_ingress_barrier); + while (tsc_ingress_barrier != 2) + CPU_BUSY_CYCLE(); + tsc_test_ap(); + atomic_inc_int(&tsc_egress_barrier); + while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2) + CPU_BUSY_CYCLE(); + } while (tsc_test_rounds > 0); } void -tsc_sync_bp(struct cpu_info *ci) +tsc_report_test_results(void) { - uint64_t bptsc, aptsc; - - tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */ - tsc_read_bp(ci, &bptsc, &aptsc); + u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1; - /* Compute final value to adjust for skew. 
*/ - ci->ci_tsc_skew = bptsc - aptsc; + if (tsc_bp_status.adj != 0) { + printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n", + tsc_bp_status.adj); + } + if (tsc_ap_status.adj != 0) { + printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n", + tsc_ap_name, tsc_ap_status.adj); + } + if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) { + printf("tsc: cpu0/%s: sync test round %u/%u failed\n", + tsc_ap_name, round, TSC_TEST_ROUNDS); + } + if (tsc_bp_status.lag_count > 0) { + printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n", + tsc_ap_name, tsc_bp_status.lag_count, + tsc_bp_status.lag_max); + } + if (tsc_ap_status.lag_count > 0) { + printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n", + tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count, + tsc_ap_status.lag_max); + } } /* - * Called during startup of AP, by the AP itself. Interrupts are - * disabled on entry. + * Reset IA32_TSC_ADJUST if we have it. + * + * XXX We should rearrange cpu_hatch() so that the feature flags + * are already set before we get here. Do CPUID() and CPUID_LEAF() + * here as a workaround until then. */ void -tsc_post_ap(struct cpu_info *ci) +tsc_reset_adjust(struct tsc_test_status *tts) { - uint64_t tsc; - - /* Wait for go-ahead from primary. */ - while ((ci->ci_flags & CPUF_SYNCTSC) == 0) - membar_consumer(); - tsc = (rdtsc_lfence() >> 1); - - /* Instruct primary to read its counter. */ - atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC); - tsc += (rdtsc_lfence() >> 1); - - /* Post result. Ensure the whole value goes out atomically. 
*/ - (void)atomic_swap_64(&tsc_sync_val, tsc); + uint32_t eax, ebx, ecx, edx; - if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) - panic("tsc_sync_ap"); + CPUID(0, eax, ebx, ecx, edx); + if (eax >= 7) { + CPUID_LEAF(7, 0, eax, ebx, ecx, edx); + if (ISSET(ebx, SEFF0EBX_TSC_ADJUST)) { + tts->adj = rdmsr(MSR_TSC_ADJUST); + if (tts->adj != 0) + wrmsr(MSR_TSC_ADJUST, 0); + } + } } void -tsc_sync_ap(struct cpu_info *ci) +tsc_test_ap(void) { - tsc_post_ap(ci); - tsc_post_ap(ci); + uint64_t ap_val, bp_val, end, lag; + u_int i = 0; + + ap_val = rdtsc_lfence(); + end = ap_val + tsc_test_cycles; + while (ap_val < end) { + /* + * The LFENCE ensures bp_val predates ap_val. If ap_val + * is smaller than bp_val then the AP's TSC must lag that + * of the BP and the counters cannot be synchronized. + */ + bp_val = tsc_bp_status.val; + ap_val = rdtsc_lfence(); + tsc_ap_status.val = ap_val; + + /* + * Ensure the other CPU has a chance to run. This is + * crucial if the other CPU is our SMT sibling. SMT + * starvation can prevent this test from detecting + * relatively large lags. Eight is an arbitrary value, + * but it seems to work in practice without impacting + * the sensitivity of the test for non-sibling threads. + */ + if (++i % 8 == 0) + CPU_BUSY_CYCLE(); + + /* + * Record the magnitude of the problem if our TSC lags + * the other. 
+ */ + if (__predict_false(ap_val < bp_val)) { + tsc_ap_status.lag_count++; + lag = bp_val - ap_val; + if (tsc_ap_status.lag_max < lag) + tsc_ap_status.lag_max = lag; + } + } } void -tsc_delay(int usecs) +tsc_test_bp(void) { - uint64_t interval, start; + uint64_t ap_val, bp_val, end, lag; + u_int i = 0; - interval = (uint64_t)usecs * tsc_frequency / 1000000; - start = rdtsc_lfence(); - while (rdtsc_lfence() - start < interval) - CPU_BUSY_CYCLE(); + bp_val = rdtsc_lfence(); + end = bp_val + tsc_test_cycles; + while (bp_val < end) { + ap_val = tsc_ap_status.val; + bp_val = rdtsc_lfence(); + tsc_bp_status.val = bp_val; + + if (++i % 8 == 0) + CPU_BUSY_CYCLE(); + + if (__predict_false(bp_val < ap_val)) { + tsc_bp_status.lag_count++; + lag = ap_val - bp_val; + if (tsc_bp_status.lag_max < lag) + tsc_bp_status.lag_max = lag; + } + } } + +#endif /* MULTIPROCESSOR */ Index: sys/arch/amd64/amd64/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v retrieving revision 1.156 diff -u -p -r1.156 cpu.c --- sys/arch/amd64/amd64/cpu.c 26 Apr 2022 08:35:30 -0000 1.156 +++ sys/arch/amd64/amd64/cpu.c 20 Jul 2022 21:58:40 -0000 @@ -772,9 +772,9 @@ cpu_init(struct cpu_info *ci) lcr4(cr4 & ~CR4_PGE); lcr4(cr4); - /* Synchronize TSC */ + /* Check if TSC is synchronized. */ if (cold && !CPU_IS_PRIMARY(ci)) - tsc_sync_ap(ci); + tsc_test_sync_ap(ci); #endif } @@ -854,18 +854,14 @@ cpu_start_secondary(struct cpu_info *ci) #endif } else { /* - * Synchronize time stamp counters. Invalidate cache and - * synchronize twice (in tsc_sync_bp) to minimize possible - * cache effects. Disable interrupts to try and rule out any - * external interference. + * Test if TSCs are synchronized. Invalidate cache to + * minimize possible cache effects. Disable interrupts to + * try to rule out external interference. 
*/ s = intr_disable(); wbinvd(); - tsc_sync_bp(ci); + tsc_test_sync_bp(ci); intr_restore(s); -#ifdef TSC_DEBUG - printf("TSC skew=%lld\n", (long long)ci->ci_tsc_skew); -#endif } if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) { @@ -890,7 +886,6 @@ void cpu_boot_secondary(struct cpu_info *ci) { int i; - int64_t drift; u_long s; atomic_setbits_int(&ci->ci_flags, CPUF_GO); @@ -905,18 +900,11 @@ cpu_boot_secondary(struct cpu_info *ci) db_enter(); #endif } else if (cold) { - /* Synchronize TSC again, check for drift. */ - drift = ci->ci_tsc_skew; + /* Test if TSCs are synchronized again. */ s = intr_disable(); wbinvd(); - tsc_sync_bp(ci); + tsc_test_sync_bp(ci); intr_restore(s); - drift -= ci->ci_tsc_skew; -#ifdef TSC_DEBUG - printf("TSC skew=%lld drift=%lld\n", - (long long)ci->ci_tsc_skew, (long long)drift); -#endif - tsc_sync_drift(drift); } } @@ -942,13 +930,12 @@ cpu_hatch(void *v) #endif /* - * Synchronize the TSC for the first time. Note that interrupts are - * off at this point. + * Test if our TSC is synchronized for the first time. + * Note that interrupts are off at this point. 
*/ wbinvd(); ci->ci_flags |= CPUF_PRESENT; - ci->ci_tsc_skew = 0; /* reset on resume */ - tsc_sync_ap(ci); + tsc_test_sync_ap(ci); lapic_enable(); lapic_startclock(); Index: sys/arch/amd64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.145 diff -u -p -r1.145 cpu.h --- sys/arch/amd64/include/cpu.h 12 Jul 2022 04:46:00 -0000 1.145 +++ sys/arch/amd64/include/cpu.h 20 Jul 2022 21:58:40 -0000 @@ -207,8 +207,6 @@ struct cpu_info { paddr_t ci_vmxon_region_pa; struct vmxon_region *ci_vmxon_region; - int64_t ci_tsc_skew; /* counter skew vs cpu0 */ - char ci_panicbuf[512]; paddr_t ci_vmcs_pa; @@ -228,7 +226,6 @@ struct cpu_info { #define CPUF_INVAR_TSC 0x0100 /* CPU has invariant TSC */ #define CPUF_USERXSTATE 0x0200 /* CPU has curproc's xsave state */ -#define CPUF_SYNCTSC 0x0800 /* Synchronize TSC */ #define CPUF_PRESENT 0x1000 /* CPU is present */ #define CPUF_RUNNING 0x2000 /* CPU is running */ #define CPUF_PAUSE 0x4000 /* CPU is paused in DDB */ Index: sys/arch/amd64/include/cpuvar.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v retrieving revision 1.11 diff -u -p -r1.11 cpuvar.h --- sys/arch/amd64/include/cpuvar.h 16 May 2021 04:33:05 -0000 1.11 +++ sys/arch/amd64/include/cpuvar.h 20 Jul 2022 21:58:40 -0000 @@ -97,8 +97,7 @@ void identifycpu(struct cpu_info *); void cpu_init(struct cpu_info *); void cpu_init_first(void); -void tsc_sync_drift(int64_t); -void tsc_sync_bp(struct cpu_info *); -void tsc_sync_ap(struct cpu_info *); +void tsc_test_sync_bp(struct cpu_info *); +void tsc_test_sync_ap(struct cpu_info *); #endif