On Thu, Jul 28, 2022 at 04:57:41PM -0400, Dave Voutila wrote:
> 
> Stuart Henderson <s...@spacehopper.org> writes:
> 
> > On 2022/07/28 12:57, Scott Cheloha wrote:
> >> On Thu, Jul 28, 2022 at 07:55:40AM -0400, Dave Voutila wrote:
> >> > 
> >> > This is breaking timecounter selection on my x13 Ryzen 5 Pro laptop
> >> > running the latest kernel from snaps.
> >> 
> >> Define "breaking".
> > 
> > That's clear from the output:
> > 
> > : On 2022/07/28 07:55, Dave Voutila wrote:
> > : > $ sysctl -a | grep tsc
> > : > kern.timecounter.choice=i8254(0) tsc(-1000) acpihpet0(1000)
> > : >     acpitimer0(1000)
> > : > machdep.tscfreq=2096064730
> > : > machdep.invarianttsc=1
> > : > 
> > : > $ sysctl kern.timecounter
> > : > kern.timecounter.tick=1
> > : > kern.timecounter.timestepwarnings=0
> > : > kern.timecounter.hardware=i8254
> > : > kern.timecounter.choice=i8254(0) tsc(-1000) acpihpet0(1000)
> > : >     acpitimer0(1000)
> > 
> >> The code detects TSC desync and marks the timecounter non-monotonic.
> > 
> > That's good (and I think as would have happened before)
> > 
> >> So it uses the i8254 instead.
> > 
> > But that's not so good, there are higher prio timecounters available,
> > acpihpet0 and acpitimer0, which would be better choices than i8254.
> 
> Exactly my point. Thanks Stuart.
Okay, please try this patch on the machine in question.

It adds a tc_detach() function to kern_tc.c.  The first time we fail
the sync test, the BP calls tc_detach(), changes the TSC's tc_quality
to a negative value to tell everyone "this is not monotonic", then
reinstalls the TSC timecounter with tc_init().

Because we are making this call *once*, from one place, I do not think
the O(n) removal time matters, so I have not switched tc_list from an
SLIST to a TAILQ.

It is possible for a thread to be asleep in sysctl_tc_hardware()
during resume, but such a thread would already be done iterating
through the list if it had reached rw_enter_write(), so removing and
re-adding tsc_timecounter during resume cannot break list traversal.

Switching the active timecounter during resume is also fine.  The only
race is with tc_adjfreq().  If a thread is asleep in adjfreq(2) when
the system suspends and we change the active timecounter during
resume, the frequency change may be applied to the "wrong"
timecounter.

... but this is always a race, because adjfreq(2) only operates on the
active timecounter, and root can change the active timecounter at any
time via sysctl(2).  So it's not a new problem.

... It might be simpler to just change tc_lock from an rwlock to a
mutex.  That would make the MP analysis across a suspend/resume much
more straightforward.

Index: sys/arch/amd64/amd64/tsc.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.24
diff -u -p -r1.24 tsc.c
--- sys/arch/amd64/amd64/tsc.c	31 Aug 2021 15:11:54 -0000	1.24
+++ sys/arch/amd64/amd64/tsc.c	29 Jul 2022 01:06:17 -0000
@@ -36,13 +36,6 @@ int	tsc_recalibrate;
 uint64_t	tsc_frequency;
 int		tsc_is_invariant;
 
-#define	TSC_DRIFT_MAX	250
-#define	TSC_SKEW_MAX	100
-int64_t	tsc_drift_observed;
-
-volatile int64_t	tsc_sync_val;
-volatile struct cpu_info	*tsc_sync_cpu;
-
 u_int		tsc_get_timecount(struct timecounter *tc);
 void		tsc_delay(int usecs);
 
@@ -236,22 +229,12 @@ cpu_recalibrate_tsc(struct timecounter *
 u_int
 tsc_get_timecount(struct timecounter *tc)
 {
-	return rdtsc_lfence() + curcpu()->ci_tsc_skew;
+	return rdtsc_lfence();
 }
 
 void
 tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
 {
-#ifdef TSC_DEBUG
-	printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname,
-	    (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed);
-#endif
-	if (ci->ci_tsc_skew < -TSC_SKEW_MAX || ci->ci_tsc_skew > TSC_SKEW_MAX) {
-		printf("%s: disabling user TSC (skew=%lld)\n",
-		    ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew);
-		tsc_timecounter.tc_user = 0;
-	}
-
 	if (!(ci->ci_flags & CPUF_PRIMARY) ||
 	    !(ci->ci_flags & CPUF_CONST_TSC) ||
 	    !(ci->ci_flags & CPUF_INVAR_TSC))
@@ -268,111 +251,276 @@ tsc_timecounter_init(struct cpu_info *ci
 		calibrate_tsc_freq();
 	}
 
-	if (tsc_drift_observed > TSC_DRIFT_MAX) {
-		printf("ERROR: %lld cycle TSC drift observed\n",
-		    (long long)tsc_drift_observed);
-		tsc_timecounter.tc_quality = -1000;
-		tsc_timecounter.tc_user = 0;
-		tsc_is_invariant = 0;
-	}
-
 	tc_init(&tsc_timecounter);
 }
 
-/*
- * Record drift (in clock cycles). Called during AP startup.
- */
 void
-tsc_sync_drift(int64_t drift)
+tsc_delay(int usecs)
 {
-	if (drift < 0)
-		drift = -drift;
-	if (drift > tsc_drift_observed)
-		tsc_drift_observed = drift;
+	uint64_t interval, start;
+
+	interval = (uint64_t)usecs * tsc_frequency / 1000000;
+	start = rdtsc_lfence();
+	while (rdtsc_lfence() - start < interval)
+		CPU_BUSY_CYCLE();
 }
 
+#ifdef MULTIPROCESSOR
+
+#define TSC_DEBUG 1
+
+/*
+ * Protections for global variables in this code:
+ *
+ *	a	Modified atomically
+ *	b	Protected by a barrier
+ *	p	Only modified by the primary CPU
+ */
+
+#define	TSC_TEST_MS		1	/* Test round duration */
+#define	TSC_TEST_ROUNDS		2	/* Number of test rounds */
+
 /*
- * Called during startup of APs, by the boot processor. Interrupts
- * are disabled on entry.
+ * tsc_test_status.val is isolated to its own cache line to limit
+ * false sharing and reduce the test's margin of error.
  */
+struct tsc_test_status {
+	volatile uint64_t val;		/* [b] Latest RDTSC value */
+	uint64_t pad1[7];
+	uint64_t lag_count;		/* [b] Number of lags seen by CPU */
+	uint64_t lag_max;		/* [b] Biggest lag seen by CPU */
+	int64_t adj;			/* [b] Initial IA32_TSC_ADJUST value */
+	uint64_t pad2[5];
+} __aligned(64);
+struct tsc_test_status tsc_ap_status;	/* [b] Test results from AP */
+struct tsc_test_status tsc_bp_status;	/* [p] Test results from BP */
+uint64_t tsc_test_cycles;		/* [p] TSC cycles per test round */
+const char *tsc_ap_name;		/* [b] Name of AP running test */
+volatile u_int tsc_egress_barrier;	/* [a] Test end barrier */
+volatile u_int tsc_ingress_barrier;	/* [a] Test start barrier */
+volatile u_int tsc_test_rounds;		/* [p] Remaining test rounds */
+int tsc_is_synchronized = 1;		/* [p] TSC sync'd across all CPUs? */
+
+void tsc_report_test_results(void);
+void tsc_reset_adjust(struct tsc_test_status *);
+void tsc_test_ap(void);
+void tsc_test_bp(void);
+
 void
-tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
+tsc_test_sync_bp(struct cpu_info *ci)
 {
-	uint64_t bptsc;
-
-	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL)
-		panic("tsc_sync_bp: 1");
+	if (!tsc_is_invariant)
+		return;
+#ifndef TSC_DEBUG
+	/* No point in testing again if we already failed. */
+	if (!tsc_is_synchronized)
+		return;
+#endif
+	/* Reset IA32_TSC_ADJUST if it exists. */
+	tsc_reset_adjust(&tsc_bp_status);
 
-	/* Flag it and read our TSC. */
-	atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-	bptsc = (rdtsc_lfence() >> 1);
+	/* Reset the test cycle limit and round count. */
+	tsc_test_cycles = TSC_TEST_MS * tsc_frequency / 1000;
+	tsc_test_rounds = TSC_TEST_ROUNDS;
+
+	do {
+		/*
+		 * Pass through the ingress barrier, run the test,
+		 * then wait for the AP to reach the egress barrier.
+		 */
+		atomic_inc_int(&tsc_ingress_barrier);
+		while (tsc_ingress_barrier != 2)
+			CPU_BUSY_CYCLE();
+		tsc_test_bp();
+		while (tsc_egress_barrier != 1)
+			CPU_BUSY_CYCLE();
+
+		/*
+		 * Report what happened. Adjust the TSC's tc_quality
+		 * if this is the first time we've failed the test.
+		 */
+		tsc_report_test_results();
+		if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
+			if (tsc_is_synchronized) {
+				tsc_is_synchronized = 0;
+				tc_detach(&tsc_timecounter);
+				tsc_timecounter.tc_quality = -1000;
+				tc_init(&tsc_timecounter);
+			}
+			tsc_test_rounds = 0;
+		} else
+			tsc_test_rounds--;
+
+		/*
+		 * Clean up for the next round. It is safe to reset the
+		 * ingress barrier because at this point we know the AP
+		 * has reached the egress barrier.
+		 */
+		memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
+		memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
+		tsc_ingress_barrier = 0;
+		if (tsc_test_rounds == 0)
+			tsc_ap_name = NULL;
+
+		/*
+		 * Pass through the egress barrier and release the AP.
+		 * The AP is responsible for resetting the egress barrier.
+		 */
+		if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
+			panic("%s: unexpected egress count", __func__);
+	} while (tsc_test_rounds > 0);
+}
 
-	/* Wait for remote to complete, and read ours again. */
-	while ((ci->ci_flags & CPUF_SYNCTSC) != 0)
-		membar_consumer();
-	bptsc += (rdtsc_lfence() >> 1);
+void
+tsc_test_sync_ap(struct cpu_info *ci)
+{
+	if (!tsc_is_invariant)
+		return;
+#ifndef TSC_DEBUG
+	if (!tsc_is_synchronized)
+		return;
+#endif
+	/* The BP needs our name in order to report any problems. */
+	if (atomic_cas_ptr(&tsc_ap_name, NULL, ci->ci_dev->dv_xname) != NULL) {
+		panic("%s: %s: tsc_ap_name is not NULL: %s",
+		    __func__, ci->ci_dev->dv_xname, tsc_ap_name);
+	}
 
-	/* Wait for the results to come in. */
-	while (tsc_sync_cpu == ci)
-		CPU_BUSY_CYCLE();
-	if (tsc_sync_cpu != NULL)
-		panic("tsc_sync_bp: 2");
+	tsc_reset_adjust(&tsc_ap_status);
 
-	*bptscp = bptsc;
-	*aptscp = tsc_sync_val;
+	/*
+	 * The AP is only responsible for running the test and
+	 * resetting the egress barrier. The BP handles everything
+	 * else.
+	 */
+	do {
+		atomic_inc_int(&tsc_ingress_barrier);
+		while (tsc_ingress_barrier != 2)
+			CPU_BUSY_CYCLE();
+		tsc_test_ap();
+		atomic_inc_int(&tsc_egress_barrier);
+		while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
+			CPU_BUSY_CYCLE();
+	} while (tsc_test_rounds > 0);
 }
 
 void
-tsc_sync_bp(struct cpu_info *ci)
+tsc_report_test_results(void)
 {
-	uint64_t bptsc, aptsc;
+	u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;
 
-	tsc_read_bp(ci, &bptsc, &aptsc);	/* discarded - cache effects */
-	tsc_read_bp(ci, &bptsc, &aptsc);
-
-	/* Compute final value to adjust for skew. */
-	ci->ci_tsc_skew = bptsc - aptsc;
+	if (tsc_bp_status.adj != 0) {
+		printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n",
+		    tsc_bp_status.adj);
+	}
+	if (tsc_ap_status.adj != 0) {
+		printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n",
+		    tsc_ap_name, tsc_ap_status.adj);
+	}
+	if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) {
+		printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
+		    tsc_ap_name, round, TSC_TEST_ROUNDS);
+	}
+	if (tsc_bp_status.lag_count > 0) {
+		printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
+		    tsc_ap_name, tsc_bp_status.lag_count,
+		    tsc_bp_status.lag_max);
+	}
+	if (tsc_ap_status.lag_count > 0) {
+		printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
+		    tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
+		    tsc_ap_status.lag_max);
+	}
 }
 
 /*
- * Called during startup of AP, by the AP itself. Interrupts are
- * disabled on entry.
+ * Reset IA32_TSC_ADJUST if we have it.
+ *
+ * XXX We should rearrange cpu_hatch() so that the feature
+ * flags are already set before we get here. Check CPUID
+ * by hand until then.
  */
 void
-tsc_post_ap(struct cpu_info *ci)
+tsc_reset_adjust(struct tsc_test_status *tts)
 {
-	uint64_t tsc;
-
-	/* Wait for go-ahead from primary. */
-	while ((ci->ci_flags & CPUF_SYNCTSC) == 0)
-		membar_consumer();
-	tsc = (rdtsc_lfence() >> 1);
-
-	/* Instruct primary to read its counter. */
-	atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-	tsc += (rdtsc_lfence() >> 1);
-
-	/* Post result. Ensure the whole value goes out atomically. */
-	(void)atomic_swap_64(&tsc_sync_val, tsc);
+	uint32_t eax, ebx, ecx, edx;
 
-	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci)
-		panic("tsc_sync_ap");
+	CPUID(0, eax, ebx, ecx, edx);
+	if (eax >= 7) {
+		CPUID_LEAF(7, 0, eax, ebx, ecx, edx);
+		if (ISSET(ebx, SEFF0EBX_TSC_ADJUST)) {
+			tts->adj = rdmsr(MSR_TSC_ADJUST);
+			if (tts->adj != 0)
+				wrmsr(MSR_TSC_ADJUST, 0);
+		}
+	}
 }
 
 void
-tsc_sync_ap(struct cpu_info *ci)
+tsc_test_ap(void)
 {
-	tsc_post_ap(ci);
-	tsc_post_ap(ci);
+	uint64_t ap_val, bp_val, end, lag;
+	u_int i = 0;
+
+	ap_val = rdtsc_lfence();
+	end = ap_val + tsc_test_cycles;
+	while (ap_val < end) {
+		/*
+		 * The LFENCE ensures bp_val predates ap_val. If ap_val
+		 * is smaller than bp_val then the AP's TSC must lag that
+		 * of the BP and the counters cannot be synchronized.
+		 */
+		bp_val = tsc_bp_status.val;
+		ap_val = rdtsc_lfence();
+		tsc_ap_status.val = ap_val;
+
+		/*
+		 * Ensure the other CPU has a chance to run. This is
+		 * crucial if the other CPU is our SMT sibling. SMT
+		 * starvation can prevent this test from detecting
+		 * relatively large lags. Eight is an arbitrary value,
+		 * but it seems to work in practice without impacting
+		 * the sensitivity of the test for non-sibling threads.
+		 */
+		if (++i % 8 == 0)
+			CPU_BUSY_CYCLE();
+
+		/*
+		 * Record the magnitude of the problem if our TSC lags
+		 * the other.
+		 */
+		if (ap_val < bp_val) {
+			tsc_ap_status.lag_count++;
+			lag = bp_val - ap_val;
+			if (tsc_ap_status.lag_max < lag)
+				tsc_ap_status.lag_max = lag;
+		}
+	}
 }
 
 void
-tsc_delay(int usecs)
+tsc_test_bp(void)
 {
-	uint64_t interval, start;
+	uint64_t ap_val, bp_val, end, lag;
+	u_int i = 0;
 
-	interval = (uint64_t)usecs * tsc_frequency / 1000000;
-	start = rdtsc_lfence();
-	while (rdtsc_lfence() - start < interval)
-		CPU_BUSY_CYCLE();
+	bp_val = rdtsc_lfence();
+	end = bp_val + tsc_test_cycles;
+	while (bp_val < end) {
+		ap_val = tsc_ap_status.val;
+		bp_val = rdtsc_lfence();
+		tsc_bp_status.val = bp_val;
+
+		if (++i % 8 == 0)
+			CPU_BUSY_CYCLE();
+
+		if (bp_val < ap_val) {
+			tsc_bp_status.lag_count++;
+			lag = ap_val - bp_val;
+			if (tsc_bp_status.lag_max < lag)
+				tsc_bp_status.lag_max = lag;
+		}
+	}
 }
+
+#endif /* MULTIPROCESSOR */
Index: sys/arch/amd64/amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.156
diff -u -p -r1.156 cpu.c
--- sys/arch/amd64/amd64/cpu.c	26 Apr 2022 08:35:30 -0000	1.156
+++ sys/arch/amd64/amd64/cpu.c	29 Jul 2022 01:06:17 -0000
@@ -772,9 +772,9 @@ cpu_init(struct cpu_info *ci)
 	lcr4(cr4 & ~CR4_PGE);
 	lcr4(cr4);
 
-	/* Synchronize TSC */
+	/* Check if TSC is synchronized. */
 	if (cold && !CPU_IS_PRIMARY(ci))
-		tsc_sync_ap(ci);
+		tsc_test_sync_ap(ci);
 #endif
 }
 
@@ -854,18 +854,14 @@ cpu_start_secondary(struct cpu_info *ci)
 #endif
 	} else {
 		/*
-		 * Synchronize time stamp counters. Invalidate cache and
-		 * synchronize twice (in tsc_sync_bp) to minimize possible
-		 * cache effects. Disable interrupts to try and rule out any
-		 * external interference.
+		 * Test if TSCs are synchronized. Invalidate cache to
+		 * minimize possible cache effects. Disable interrupts to
+		 * try to rule out external interference.
 		 */
 		s = intr_disable();
 		wbinvd();
-		tsc_sync_bp(ci);
+		tsc_test_sync_bp(ci);
 		intr_restore(s);
-#ifdef TSC_DEBUG
-		printf("TSC skew=%lld\n", (long long)ci->ci_tsc_skew);
-#endif
 	}
 
 	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
@@ -890,7 +886,6 @@ void
 cpu_boot_secondary(struct cpu_info *ci)
 {
 	int i;
-	int64_t drift;
 	u_long s;
 
 	atomic_setbits_int(&ci->ci_flags, CPUF_GO);
@@ -905,18 +900,11 @@ cpu_boot_secondary(struct cpu_info *ci)
 		db_enter();
 #endif
 	} else if (cold) {
-		/* Synchronize TSC again, check for drift. */
-		drift = ci->ci_tsc_skew;
+		/* Test if TSCs are synchronized again. */
 		s = intr_disable();
 		wbinvd();
-		tsc_sync_bp(ci);
+		tsc_test_sync_bp(ci);
 		intr_restore(s);
-		drift -= ci->ci_tsc_skew;
-#ifdef TSC_DEBUG
-		printf("TSC skew=%lld drift=%lld\n",
-		    (long long)ci->ci_tsc_skew, (long long)drift);
-#endif
-		tsc_sync_drift(drift);
 	}
 }
 
@@ -942,13 +930,12 @@ cpu_hatch(void *v)
 #endif
 
 	/*
-	 * Synchronize the TSC for the first time. Note that interrupts are
-	 * off at this point.
+	 * Test if our TSC is synchronized for the first time.
+	 * Note that interrupts are off at this point.
 	 */
 	wbinvd();
 	ci->ci_flags |= CPUF_PRESENT;
-	ci->ci_tsc_skew = 0;	/* reset on resume */
-	tsc_sync_ap(ci);
+	tsc_test_sync_ap(ci);
 	lapic_enable();
 	lapic_startclock();
 
Index: sys/arch/amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.145
diff -u -p -r1.145 cpu.h
--- sys/arch/amd64/include/cpu.h	12 Jul 2022 04:46:00 -0000	1.145
+++ sys/arch/amd64/include/cpu.h	29 Jul 2022 01:06:18 -0000
@@ -207,8 +207,6 @@ struct cpu_info {
 	paddr_t		ci_vmxon_region_pa;
 	struct vmxon_region *ci_vmxon_region;
 
-	int64_t		ci_tsc_skew;		/* counter skew vs cpu0 */
-
 	char		ci_panicbuf[512];
 
 	paddr_t		ci_vmcs_pa;
@@ -228,7 +226,6 @@ struct cpu_info {
 #define CPUF_INVAR_TSC	0x0100		/* CPU has invariant TSC */
 #define CPUF_USERXSTATE	0x0200		/* CPU has curproc's xsave state */
 
-#define CPUF_SYNCTSC	0x0800		/* Synchronize TSC */
 #define CPUF_PRESENT	0x1000		/* CPU is present */
 #define CPUF_RUNNING	0x2000		/* CPU is running */
 #define CPUF_PAUSE	0x4000		/* CPU is paused in DDB */
Index: sys/arch/amd64/include/cpuvar.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 cpuvar.h
--- sys/arch/amd64/include/cpuvar.h	16 May 2021 04:33:05 -0000	1.11
+++ sys/arch/amd64/include/cpuvar.h	29 Jul 2022 01:06:18 -0000
@@ -97,8 +97,7 @@ void identifycpu(struct cpu_info *);
 void cpu_init(struct cpu_info *);
 void cpu_init_first(void);
 
-void tsc_sync_drift(int64_t);
-void tsc_sync_bp(struct cpu_info *);
-void tsc_sync_ap(struct cpu_info *);
+void tsc_test_sync_bp(struct cpu_info *);
+void tsc_test_sync_ap(struct cpu_info *);
 
 #endif
Index: sys/sys/timetc.h
===================================================================
RCS file: /cvs/src/sys/sys/timetc.h,v
retrieving revision 1.12
diff -u -p -r1.12 timetc.h
--- sys/sys/timetc.h	6 Jul 2020 13:33:09 -0000	1.12
+++ sys/sys/timetc.h	29 Jul 2022 01:06:18 -0000
@@ -120,6 +120,7 @@ extern struct timekeep *timekeep;
 u_int64_t tc_getfrequency(void);
 u_int64_t tc_getprecision(void);
 void	tc_init(struct timecounter *tc);
+void	tc_detach(struct timecounter *);
 void	tc_setclock(const struct timespec *ts);
 void	tc_setrealtimeclock(const struct timespec *ts);
 void	tc_ticktock(void);
Index: sys/kern/kern_tc.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_tc.c,v
retrieving revision 1.76
diff -u -p -r1.76 kern_tc.c
--- sys/kern/kern_tc.c	23 Jul 2022 22:58:51 -0000	1.76
+++ sys/kern/kern_tc.c	29 Jul 2022 01:06:18 -0000
@@ -458,6 +458,36 @@ tc_init(struct timecounter *tc)
 	timecounter = tc;
 }
 
+/*
+ * Remove the given timecounter from the list of counters.
+ * Activate the best remaining counter if it was the active
+ * counter.
+ */
+void
+tc_detach(struct timecounter *tc)
+{
+	struct timecounter *best = &dummy_timecounter, *tmp;
+
+	if (tc == &dummy_timecounter)
+		panic("%s: cannot detach dummy counter", __func__);
+
+	SLIST_REMOVE(&tc_list, tc, timecounter, tc_next);
+	if (timecounter == tc) {
+		SLIST_FOREACH(tmp, &tc_list, tc_next) {
+			if (tmp->tc_quality < 0)
+				continue;
+			if (tmp->tc_quality < best->tc_quality)
+				continue;
+			if (tmp->tc_quality == best->tc_quality &&
+			    tmp->tc_frequency < best->tc_frequency)
+				continue;
+			best = tmp;
+		}
+		enqueue_randomness(best->tc_get_timecount(best));
+		timecounter = best;
+	}
+}
+
 /* Report the frequency of the current timecounter. */
 u_int64_t
 tc_getfrequency(void)
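
For anyone following along, here is a rough userland sketch (not kernel
code; the struct, the counter list, and the frequencies below are
illustrative stand-ins) of the fallback rule tc_detach() applies above:
counters with negative tc_quality are never auto-selected, higher
tc_quality wins, and higher tc_frequency breaks ties.  With the TSC
demoted to -1000 this picks acpihpet0 rather than dropping all the way
to the i8254, which is the behavior Stuart and Dave are asking for.

/* tc_pick.c: standalone illustration of tc_detach()'s fallback rule. */
#include <stdio.h>
#include <stdint.h>

struct tc {				/* simplified stand-in for struct timecounter */
	const char	*name;
	int		 quality;	/* like tc_quality: <0 means "never auto-select" */
	uint64_t	 frequency;	/* like tc_frequency: used as a tie-breaker */
};

int
main(void)
{
	struct tc list[] = {
		{ "i8254",	0,	1193182ULL },
		{ "tsc",	-1000,	2096064730ULL },	/* demoted after a failed sync test */
		{ "acpihpet0",	1000,	14318180ULL },
		{ "acpitimer0",	1000,	3579545ULL },
	};
	struct tc dummy = { "dummy", -1000000, 1000000ULL };	/* worst-case starting point */
	struct tc *best = &dummy;
	size_t i;

	/* Same three checks as the SLIST_FOREACH loop in tc_detach(). */
	for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) {
		if (list[i].quality < 0)
			continue;
		if (list[i].quality < best->quality)
			continue;
		if (list[i].quality == best->quality &&
		    list[i].frequency < best->frequency)
			continue;
		best = &list[i];
	}
	printf("selected %s\n", best->name);	/* prints "selected acpihpet0" */
	return 0;
}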