Hi, Scott.

Recently I started running OpenBSD on ESXi.
I'm facing monotonic time going back problem as same as Yasuoka-san's report.

https://marc.info/?l=openbsd-tech&m=161657532610882&w=2

I've tried your v2 patch. It seems the problem has been solved in my enviroment.
But I'm a little bit confused about the patch. May I ask you goal of the patch?
Is it indented to fix the problem or is it for collecting test cases and the 
results?

Here is the dmesg shown in my patched environment.

OpenBSD 7.1-current (GENERIC.MP) #2: Mon May  2 12:03:22 JST 2022
    [email protected]:/usr/src/sys/arch/amd64/compile/GENERIC.MP
real mem = 2129526784 (2030MB)
avail mem = 2047713280 (1952MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.7 @ 0xe0010 (242 entries)
bios0: vendor Phoenix Technologies LTD version "6.00" date 12/12/2018
bios0: VMware, Inc. VMware Virtual Platform
acpi0 at bios0: ACPI 4.0
acpi0: sleep states S0 S1 S4 S5
acpi0: tables DSDT FACP BOOT APIC MCFG SRAT HPET WAET
acpi0: wakeup devices PCI0(S3) USB_(S1) P2P0(S3) S1F0(S3) S2F0(S3) S8F0(S3) 
S16F(S3) S17F(S3) S18F(S3) S22F(S3) S23F(S3) S24F(S3) S25F(S3) PE40(S3) 
S1F0(S3) PE50(S3) [...]
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: AMD Ryzen 7 5700G with Radeon Graphics, 3792.27 MHz, 19-50-00
cpu0: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,PKU,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu0: 32KB 64b/line 8-way I-cache, 32KB 64b/line 8-way D-cache, 512KB 64b/line 
8-way L2 cache
cpu0: ITLB 64 4KB entries fully associative, 64 4MB entries fully associative
cpu0: DTLB 64 4KB entries fully associative, 64 4MB entries fully associative
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
cpu0: apic clock running at 65MHz
cpu1 at mainbus0: apid 2 (application processor)
cpu1: AMD Ryzen 7 5700G with Radeon Graphics, 3792.01 MHz, 19-50-00
cpu1: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,PKU,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu1: 32KB 64b/line 8-way I-cache, 32KB 64b/line 8-way D-cache, 512KB 64b/line 
8-way L2 cache
cpu1: ITLB 64 4KB entries fully associative, 64 4MB entries fully associative
cpu1: DTLB 64 4KB entries fully associative, 64 4MB entries fully associative
cpu1: smt 0, core 0, package 2
ioapic0 at mainbus0: apid 1 pa 0xfec00000, version 20, 24 pins
acpimcfg0 at acpi0
acpimcfg0: addr 0xf0000000, bus 0-127
acpihpet0 at acpi0: 14318179 Hz
acpiprt0 at acpi0: bus 0 (PCI0)
acpipci0 at acpi0 PCI0: 0x00000000 0x00000011 0x00000001
acpicmos0 at acpi0
"PNP0A05" at acpi0 not configured
acpiac0 at acpi0: AC unit online
acpicpu0 at acpi0: C1(@1 halt!)
acpicpu1 at acpi0: C1(@1 halt!)
pvbus0 at mainbus0: VMware
vmt0 at pvbus0
pci0 at mainbus0 bus 0
pchb0 at pci0 dev 0 function 0 "Intel 82443BX AGP" rev 0x01
ppb0 at pci0 dev 1 function 0 "Intel 82443BX AGP" rev 0x01
pci1 at ppb0 bus 1
pcib0 at pci0 dev 7 function 0 "Intel 82371AB PIIX4 ISA" rev 0x08
pciide0 at pci0 dev 7 function 1 "Intel 82371AB IDE" rev 0x01: DMA, channel 0 
configured to compatibility, channel 1 configured to compatibility
pciide0: channel 0 disabled (no drives)
pciide0: channel 1 disabled (no drives)
piixpm0 at pci0 dev 7 function 3 "Intel 82371AB Power" rev 0x08: SMBus disabled
"VMware VMCI" rev 0x10 at pci0 dev 7 function 7 not configured
vga1 at pci0 dev 15 function 0 "VMware SVGA II" rev 0x00
wsdisplay0 at vga1 mux 1: console (80x25, vt100 emulation)
wsdisplay0: screen 1-5 added (80x25, vt100 emulation)
mpi0 at pci0 dev 16 function 0 "Symbios Logic 53c1030" rev 0x01: apic 1 int 17
mpi0: 0, firmware 1.3.41.32
scsibus1 at mpi0: 16 targets, initiator 7
sd0 at scsibus1 targ 0 lun 0: <VMware, Virtual disk, 2.0>
sd0: 40960MB, 512 bytes/sector, 83886080 sectors
mpi0: target 0 Sync at 160MHz width 16bit offset 127 QAS 1 DT 1 IU 1
ppb1 at pci0 dev 17 function 0 "VMware PCI" rev 0x02
pci2 at ppb1 bus 2
uhci0 at pci2 dev 0 function 0 "VMware UHCI" rev 0x00: apic 1 int 18
ehci0 at pci2 dev 1 function 0 "VMware EHCI" rev 0x00: apic 1 int 19
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 configuration 1 interface 0 "VMware EHCI root hub" rev 2.00/1.00 
addr 1
ahci0 at pci2 dev 3 function 0 "VMware AHCI" rev 0x00: msi, AHCI 1.3
ahci0: port 0: 6.0Gb/s
scsibus2 at ahci0: 32 targets
cd0 at scsibus2 targ 0 lun 0: <NECVMWar, VMware SATA CD00, 1.00> removable
usb1 at uhci0: USB revision 1.0
uhub1 at usb1 configuration 1 interface 0 "VMware UHCI root hub" rev 1.00/1.00 
addr 1
ppb2 at pci0 dev 21 function 0 "VMware PCIE" rev 0x01: msi
pci3 at ppb2 bus 3
em0 at pci3 dev 0 function 0 "Intel 82574L" rev 0x00: msi, address 
00:0c:29:f4:03:1d
ppb3 at pci0 dev 21 function 1 "VMware PCIE" rev 0x01: msi
pci4 at ppb3 bus 4
ppb4 at pci0 dev 21 function 2 "VMware PCIE" rev 0x01: msi
pci5 at ppb4 bus 5
ppb5 at pci0 dev 21 function 3 "VMware PCIE" rev 0x01: msi
pci6 at ppb5 bus 6
ppb6 at pci0 dev 21 function 4 "VMware PCIE" rev 0x01: msi
pci7 at ppb6 bus 7
ppb7 at pci0 dev 21 function 5 "VMware PCIE" rev 0x01: msi
pci8 at ppb7 bus 8
ppb8 at pci0 dev 21 function 6 "VMware PCIE" rev 0x01: msi
pci9 at ppb8 bus 9
ppb9 at pci0 dev 21 function 7 "VMware PCIE" rev 0x01: msi
pci10 at ppb9 bus 10
ppb10 at pci0 dev 22 function 0 "VMware PCIE" rev 0x01: msi
pci11 at ppb10 bus 11
ppb11 at pci0 dev 22 function 1 "VMware PCIE" rev 0x01: msi
pci12 at ppb11 bus 12
ppb12 at pci0 dev 22 function 2 "VMware PCIE" rev 0x01: msi
pci13 at ppb12 bus 13
ppb13 at pci0 dev 22 function 3 "VMware PCIE" rev 0x01: msi
pci14 at ppb13 bus 14
ppb14 at pci0 dev 22 function 4 "VMware PCIE" rev 0x01: msi
pci15 at ppb14 bus 15
ppb15 at pci0 dev 22 function 5 "VMware PCIE" rev 0x01: msi
pci16 at ppb15 bus 16
ppb16 at pci0 dev 22 function 6 "VMware PCIE" rev 0x01: msi
pci17 at ppb16 bus 17
ppb17 at pci0 dev 22 function 7 "VMware PCIE" rev 0x01: msi
pci18 at ppb17 bus 18
ppb18 at pci0 dev 23 function 0 "VMware PCIE" rev 0x01: msi
pci19 at ppb18 bus 19
ppb19 at pci0 dev 23 function 1 "VMware PCIE" rev 0x01: msi
pci20 at ppb19 bus 20
ppb20 at pci0 dev 23 function 2 "VMware PCIE" rev 0x01: msi
pci21 at ppb20 bus 21
ppb21 at pci0 dev 23 function 3 "VMware PCIE" rev 0x01: msi
pci22 at ppb21 bus 22
ppb22 at pci0 dev 23 function 4 "VMware PCIE" rev 0x01: msi
pci23 at ppb22 bus 23
ppb23 at pci0 dev 23 function 5 "VMware PCIE" rev 0x01: msi
pci24 at ppb23 bus 24
ppb24 at pci0 dev 23 function 6 "VMware PCIE" rev 0x01: msi
pci25 at ppb24 bus 25
ppb25 at pci0 dev 23 function 7 "VMware PCIE" rev 0x01: msi
pci26 at ppb25 bus 26
ppb26 at pci0 dev 24 function 0 "VMware PCIE" rev 0x01: msi
pci27 at ppb26 bus 27
ppb27 at pci0 dev 24 function 1 "VMware PCIE" rev 0x01: msi
pci28 at ppb27 bus 28
ppb28 at pci0 dev 24 function 2 "VMware PCIE" rev 0x01: msi
pci29 at ppb28 bus 29
ppb29 at pci0 dev 24 function 3 "VMware PCIE" rev 0x01: msi
pci30 at ppb29 bus 30
ppb30 at pci0 dev 24 function 4 "VMware PCIE" rev 0x01: msi
pci31 at ppb30 bus 31
ppb31 at pci0 dev 24 function 5 "VMware PCIE" rev 0x01: msi
pci32 at ppb31 bus 32
ppb32 at pci0 dev 24 function 6 "VMware PCIE" rev 0x01: msi
pci33 at ppb32 bus 33
ppb33 at pci0 dev 24 function 7 "VMware PCIE" rev 0x01: msi
pci34 at ppb33 bus 34
isa0 at pcib0
isadma0 at isa0
pckbc0 at isa0 port 0x60/5 irq 1 irq 12
pckbd0 at pckbc0 (kbd slot)
wskbd0 at pckbd0: console keyboard, using wsdisplay0
pms0 at pckbc0 (aux slot)
wsmouse0 at pms0 mux 0
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
uhidev0 at uhub1 port 1 configuration 1 interface 0 "VMware VMware Virtual USB 
Mouse" rev 1.10/1.02 addr 2
uhidev0: iclass 3/0
ums0 at uhidev0: 16 buttons, Z and W dir
wsmouse1 at ums0 mux 0
uhidev1 at uhub1 port 1 configuration 1 interface 1 "VMware VMware Virtual USB 
Mouse" rev 1.10/1.02 addr 2
uhidev1: iclass 3/0
ums1 at uhidev1: 16 buttons, Z and W dir
wsmouse2 at ums1 mux 0
uhub2 at uhub1 port 2 configuration 1 interface 0 "VMware VMware Virtual USB 
Hub" rev 1.10/1.00 addr 3
vscsi0 at root
scsibus3 at vscsi0: 256 targets
softraid0 at root
scsibus4 at softraid0: 256 targets
root on sd0a (44187a6f5c0fd3d3.a) swap on sd0b dump on sd0b

On 2/23/22 11:58, Scott Cheloha wrote:
Hi,

Here is a second draft patch for changing our approach to TSC
synchronization.

With this patch, instead of trying to fix desync with a handshake we
test for desync with a (more) foolproof loop and then don't attempt to
correct for desync if we detect it.

The motivation for a more foolproof loop is to eliminate the false
positive results seen on multisocket CPUs using the current handshake.
The handshake seems to interpret NUMA lag as desync and disables the
TSC on some multisocket systems.  Ideally this should not happen.

The motivation for not attempting to fix desync in the kernel with a
per-CPU skew value is, well, I think it's too error-prone.  I think
reliably correcting TSC desync in software without the IA32_TSC_ADJUST
register is basically impossible given the speed of the TSC.  If it
were a slower clock it would be more feasible, but this is not the
case.

One thing that doesn't work correctly yet is resetting
IA32_TSC_ADJUST.  The relevant feature flag is missing during the
first boot.  Could we move CPU identification up to an earlier point
in cpu_hatch() so the flag is set when we do the sync test?

--

I asked for review on the actual meat of the previous patch and got
nothing.  So I have dumbed the code down to weed out confounding
factors.

I am relatively confident that if this test detects desync that
something is off with your TSC.  Not totally confident, because I
haven't gotten any review yet, but I'm getting more confident.

When I forcibly desync the TSC on my APs during boot by 150 cycles
using the IA32_TSC_ADJUST register I get output like this:

Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu1: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu1: cpu1: 947 lags 32 cycles
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu2: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu2: cpu2: 1215 lags 30 cycles
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu3: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu3: cpu3: 1085 lags 28 cycles
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu4: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu4: cpu4: 18667 lags 94 cycles
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu5: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu5: cpu5: 771 lags 34 cycles
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu6: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu6: cpu6: 842 lags 30 cycles
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu7: sync test round 1/2 failed
Feb 21 10:05:14 jetsam /bsd: tsc: cpu0/cpu7: cpu7: 895 lags 30 cycles

The CPUs are not running at full speed during boot, so we only measure
~30 cycles of lag when the true lag is 150 cycles.  We get ~94 cycles
on CPU4 because it is an SMT twin with CPU0... for some reason this
reduces the margin of error.  In general, the margin of error shrinks
with higher clock rates.

Please test!  In particular:

- I'd love retests on systems that failed the test using the previous
   patch.  Almost all of these were AMD Ryzen CPUs.  It's hard to say
   what the issue is there.  My vague guess is a firmware bug.

   One would hope that AMD's QA would catch an issue with the #RESET
   signal, which is supposed to start all TSCs on all CPUs from zero
   simultaneously.  I am unsure how you would diagnose that it was,
   in fact, a firmware bug though.

- Multisocket systems

- Multiprocessor VMs

Please include your dmesg.

Thanks!

-Scott

Index: amd64/tsc.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.24
diff -u -p -r1.24 tsc.c
--- amd64/tsc.c 31 Aug 2021 15:11:54 -0000      1.24
+++ amd64/tsc.c 23 Feb 2022 02:23:24 -0000
@@ -36,13 +36,6 @@ int          tsc_recalibrate;
  uint64_t      tsc_frequency;
  int           tsc_is_invariant;
-#define TSC_DRIFT_MAX 250
-#define TSC_SKEW_MAX                   100
-int64_t        tsc_drift_observed;
-
-volatile int64_t       tsc_sync_val;
-volatile struct cpu_info       *tsc_sync_cpu;
-
  u_int         tsc_get_timecount(struct timecounter *tc);
  void          tsc_delay(int usecs);
@@ -236,22 +229,12 @@ cpu_recalibrate_tsc(struct timecounter *
  u_int
  tsc_get_timecount(struct timecounter *tc)
  {
-       return rdtsc_lfence() + curcpu()->ci_tsc_skew;
+       return rdtsc_lfence();
  }
void
  tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
  {
-#ifdef TSC_DEBUG
-       printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname,
-           (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed);
-#endif
-       if (ci->ci_tsc_skew < -TSC_SKEW_MAX || ci->ci_tsc_skew > TSC_SKEW_MAX) {
-               printf("%s: disabling user TSC (skew=%lld)\n",
-                   ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew);
-               tsc_timecounter.tc_user = 0;
-       }
-
        if (!(ci->ci_flags & CPUF_PRIMARY) ||
            !(ci->ci_flags & CPUF_CONST_TSC) ||
            !(ci->ci_flags & CPUF_INVAR_TSC))
@@ -268,111 +251,252 @@ tsc_timecounter_init(struct cpu_info *ci
                calibrate_tsc_freq();
        }
- if (tsc_drift_observed > TSC_DRIFT_MAX) {
-               printf("ERROR: %lld cycle TSC drift observed\n",
-                   (long long)tsc_drift_observed);
-               tsc_timecounter.tc_quality = -1000;
-               tsc_timecounter.tc_user = 0;
-               tsc_is_invariant = 0;
-       }
-
        tc_init(&tsc_timecounter);
  }
-/*
- * Record drift (in clock cycles).  Called during AP startup.
- */
  void
-tsc_sync_drift(int64_t drift)
+tsc_delay(int usecs)
  {
-       if (drift < 0)
-               drift = -drift;
-       if (drift > tsc_drift_observed)
-               tsc_drift_observed = drift;
+       uint64_t interval, start;
+
+       interval = (uint64_t)usecs * tsc_frequency / 1000000;
+       start = rdtsc_lfence();
+       while (rdtsc_lfence() - start < interval)
+               CPU_BUSY_CYCLE();
  }
+#ifdef MULTIPROCESSOR
+
+#define TSC_DEBUG 1
+
  /*
- * Called during startup of APs, by the boot processor.  Interrupts
- * are disabled on entry.
+ * Protections for global variables in this code:
+ *
+ *     a       Modified atomically
+ *     b       Protected by a barrier
+ *     p       Only modified by the primary CPU
   */
-void
-tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
-{
-       uint64_t bptsc;
- if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL)
-               panic("tsc_sync_bp: 1");
+#define TSC_TEST_MS            1       /* Test round duration */
+#define TSC_TEST_ROUNDS                2       /* Number of test rounds */
- /* Flag it and read our TSC. */
-       atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-       bptsc = (rdtsc_lfence() >> 1);
+struct tsc_test_status {
+       volatile uint64_t val __aligned(64);    /* [b] CPU's latest TSC value */
+       uint64_t lag_count;                     /* [b] # of lags seen by CPU */
+       uint64_t lag_max;                       /* [b] Biggest lag seen */
+};
+struct tsc_test_status tsc_ap_status;  /* [b] Test results from AP */
+struct tsc_test_status tsc_bp_status;  /* [p] Test results from BP */
+uint64_t tsc_test_cycles;              /* [p] TSC cycles per test round */
+const char *tsc_ap_name;               /* [b] Name of AP */
+volatile u_int tsc_egress_barrier;     /* [a] Test end barrier */
+volatile u_int tsc_ingress_barrier;    /* [a] Test start barrier */
+volatile u_int tsc_test_rounds;                /* [p] Remaining test rounds */
+int tsc_is_synchronized = 1;           /* [p] TSC sync'd across all CPUs? */
+
+void tsc_report_test_failure(void);
+void tsc_reset_adjust(const struct cpu_info *);
+void tsc_test_ap(void);
+void tsc_test_bp(void);
- /* Wait for remote to complete, and read ours again. */
-       while ((ci->ci_flags & CPUF_SYNCTSC) != 0)
-               membar_consumer();
-       bptsc += (rdtsc_lfence() >> 1);
+void
+tsc_test_sync_bp(struct cpu_info *ci)
+{
+       /* TSC must be constant and invariant to use it as a timecounter. */
+       if (!tsc_is_invariant)
+               return;
- /* Wait for the results to come in. */
-       while (tsc_sync_cpu == ci)
-               CPU_BUSY_CYCLE();
-       if (tsc_sync_cpu != NULL)
-               panic("tsc_sync_bp: 2");
+       /* Reset IA32_TSC_ADJUST if it exists. */
+       tsc_reset_adjust(ci);
- *bptscp = bptsc;
-       *aptscp = tsc_sync_val;
+       /* Reset the test cycle limit and round count. */
+       tsc_test_cycles = TSC_TEST_MS * tsc_frequency / 1000;
+       tsc_test_rounds = TSC_TEST_ROUNDS;
+
+       do {
+               /*
+                * Pass through the ingress barrier, run the test,
+                * then wait for the AP to reach the egress barrier.
+                */
+               atomic_inc_int(&tsc_ingress_barrier);
+               while (tsc_ingress_barrier != 2)
+                       CPU_BUSY_CYCLE();
+               tsc_test_bp();
+               while (tsc_egress_barrier != 1)
+                       CPU_BUSY_CYCLE();
+
+               /*
+                * Report what went wrong, if anything.
+                */
+               if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
+                       tsc_report_test_failure();
+                       /*
+                        * XXX We need a tc_detach() function to actually
+                        * disable a given timecounter.  Lowering the quality
+                        * like this is a nasty hack.
+                        */
+                       tsc_timecounter.tc_quality = -1000;
+                       tsc_is_synchronized = 0;
+                       tsc_test_rounds = 0;
+               } else
+                       tsc_test_rounds--;
+
+               /*
+                * Clean up for the next round.
+                *
+                * It is safe to reset the ingress barrier because
+                * at this point we know the AP has reached the egress
+                * barrier.
+                */
+               memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
+               memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
+               tsc_ingress_barrier = 0;
+               if (tsc_test_rounds == 0)
+                       tsc_ap_name = NULL;
+
+               /*
+                * Pass through the egress barrier and release the AP.
+                * The AP is responsible for resetting the egress barrier.
+                */
+               if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
+                       panic("%s: unexpected egress count", __func__);
+       } while (tsc_test_rounds > 0);
  }
void
-tsc_sync_bp(struct cpu_info *ci)
+tsc_test_sync_ap(struct cpu_info *ci)
  {
-       uint64_t bptsc, aptsc;
+       if (!tsc_is_invariant)
+               return;
- tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */
-       tsc_read_bp(ci, &bptsc, &aptsc);
+       tsc_reset_adjust(ci);
- /* Compute final value to adjust for skew. */
-       ci->ci_tsc_skew = bptsc - aptsc;
+/* #define TSC_FORCE_DESYNC (-150) */
+#ifdef TSC_FORCE_DESYNC
+       wrmsr(MSR_TSC_ADJUST, TSC_FORCE_DESYNC);
+#endif
+
+       /* The BP needs our name in order to log any problems. */
+       tsc_ap_name = ci->ci_dev->dv_xname;
+
+       /*
+        * The AP is only responsible for running the test and
+        * resetting the egress barrier.  The BP sets up and tears
+        * down everything else.
+        */
+       do {
+               atomic_inc_int(&tsc_ingress_barrier);
+               while (tsc_ingress_barrier != 2)
+                       CPU_BUSY_CYCLE();
+               tsc_test_ap();
+               atomic_inc_int(&tsc_egress_barrier);
+               while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
+                       CPU_BUSY_CYCLE();
+       } while (tsc_test_rounds > 0);
  }
-/*
- * Called during startup of AP, by the AP itself.  Interrupts are
- * disabled on entry.
- */
  void
-tsc_post_ap(struct cpu_info *ci)
+tsc_report_test_failure(void)
  {
-       uint64_t tsc;
-
-       /* Wait for go-ahead from primary. */
-       while ((ci->ci_flags & CPUF_SYNCTSC) == 0)
-               membar_consumer();
-       tsc = (rdtsc_lfence() >> 1);
+       u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;
- /* Instruct primary to read its counter. */
-       atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC);
-       tsc += (rdtsc_lfence() >> 1);
+       printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
+           tsc_ap_name, round, TSC_TEST_ROUNDS);
+       if (tsc_bp_status.lag_count > 0) {
+               printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
+                   tsc_ap_name, tsc_bp_status.lag_count,
+                   tsc_bp_status.lag_max);
+       }
+       if (tsc_ap_status.lag_count > 0) {
+               printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
+                   tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
+                   tsc_ap_status.lag_max);
+       }
+}
- /* Post result. Ensure the whole value goes out atomically. */
-       (void)atomic_swap_64(&tsc_sync_val, tsc);
+void
+tsc_reset_adjust(const struct cpu_info *ci)
+{
+       int64_t adj;
- if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci)
-               panic("tsc_sync_ap");
+       if (ISSET(ci->ci_feature_sefflags_ebx, SEFF0EBX_TSC_ADJUST)) {
+               adj = rdmsr(MSR_TSC_ADJUST);
+               if (adj != 0) {
+                       wrmsr(MSR_TSC_ADJUST, 0);
+#if 0
+                       /* XXX APs can't printf during boot or resume? */
+                       printf("tsc: %s: IA32_TSC_ADJUST: %+lld -> 0",
+                           ci->ci_dev->dv_xname, (long long)adj);
+#endif
+               }
+       }
  }
void
-tsc_sync_ap(struct cpu_info *ci)
+tsc_test_ap(void)
  {
-       tsc_post_ap(ci);
-       tsc_post_ap(ci);
+       uint64_t ap_val, bp_val, end, lag;
+       u_int i = 0;
+
+       ap_val = rdtsc_lfence();
+       end = ap_val + tsc_test_cycles;
+       while (ap_val < end) {
+               /*
+                * The LFENCE ensures bp_val predates ap_val.  If ap_val
+                * is smaller than bp_val then the AP's TSC must lag that
+                * of the BP, i.e. the two are not synchronized.
+                */
+               bp_val = tsc_bp_status.val;
+               ap_val = rdtsc_lfence();
+               tsc_ap_status.val = ap_val;
+
+               /*
+                * Ensure the other CPU has a chance to run.  This is
+                * crucial if the other CPU is our SMT sibling.  SMT
+                * starvation can prevent this test from detecting
+                * relatively large lags.  Eight is an arbitrary value,
+                * but it seems to work in practice without impacting
+                * the sensitivity of the test for non-sibling threads.
+                */
+               if (++i % 8 == 0)
+                       CPU_BUSY_CYCLE();
+
+               /*
+                * Record the magnitude of the problem if our TSC lags
+                * the other.
+                */
+               if (__predict_false(ap_val < bp_val)) {
+                       tsc_ap_status.lag_count++;
+                       lag = bp_val - ap_val;
+                       if (tsc_ap_status.lag_max < lag)
+                               tsc_ap_status.lag_max = lag;
+               }
+       }
  }
void
-tsc_delay(int usecs)
+tsc_test_bp(void)
  {
-       uint64_t interval, start;
+       uint64_t ap_val, bp_val, end, lag;
+       u_int i = 0;
- interval = (uint64_t)usecs * tsc_frequency / 1000000;
-       start = rdtsc_lfence();
-       while (rdtsc_lfence() - start < interval)
-               CPU_BUSY_CYCLE();
+       bp_val = rdtsc_lfence();
+       end = bp_val + tsc_test_cycles;
+       while (bp_val < end) {
+               ap_val = tsc_ap_status.val;
+               bp_val = rdtsc_lfence();
+               tsc_bp_status.val = bp_val;
+
+               if (++i % 8 == 0)
+                       CPU_BUSY_CYCLE();
+
+               if (__predict_false(bp_val < ap_val)) {
+                       tsc_bp_status.lag_count++;
+                       lag = ap_val - bp_val;
+                       if (tsc_bp_status.lag_max < lag)
+                               tsc_bp_status.lag_max = lag;
+               }
+       }
  }
+
+#endif /* MULTIPROCESSOR */
Index: amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.155
diff -u -p -r1.155 cpu.c
--- amd64/cpu.c 21 Feb 2022 11:03:39 -0000      1.155
+++ amd64/cpu.c 23 Feb 2022 02:23:26 -0000
@@ -762,9 +762,9 @@ cpu_init(struct cpu_info *ci)
        lcr4(cr4 & ~CR4_PGE);
        lcr4(cr4);
- /* Synchronize TSC */
+       /* Check if our TSC is synchronized with the BP's TSC. */
        if (cold && !CPU_IS_PRIMARY(ci))
-             tsc_sync_ap(ci);
+             tsc_test_sync_ap(ci);
  #endif
  }
@@ -844,18 +844,14 @@ cpu_start_secondary(struct cpu_info *ci)
  #endif
        } else {
                /*
-                * Synchronize time stamp counters. Invalidate cache and
-                * synchronize twice (in tsc_sync_bp) to minimize possible
-                * cache effects. Disable interrupts to try and rule out any
-                * external interference.
+                * Check if our TSC is synchronized with the AP's TSC.
+                * Invalidate cache to minimize possible cache effects
+                * and disable interrupts to minimize test interference.
                 */
                s = intr_disable();
                wbinvd();
-               tsc_sync_bp(ci);
+               tsc_test_sync_bp(ci);
                intr_restore(s);
-#ifdef TSC_DEBUG
-               printf("TSC skew=%lld\n", (long long)ci->ci_tsc_skew);
-#endif
        }
if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
@@ -880,7 +876,6 @@ void
  cpu_boot_secondary(struct cpu_info *ci)
  {
        int i;
-       int64_t drift;
        u_long s;
atomic_setbits_int(&ci->ci_flags, CPUF_GO);
@@ -895,18 +890,14 @@ cpu_boot_secondary(struct cpu_info *ci)
                db_enter();
  #endif
        } else if (cold) {
-               /* Synchronize TSC again, check for drift. */
-               drift = ci->ci_tsc_skew;
+               /*
+                * Test TSC synchronization again.  Something may
+                * have screwed it up while we were down.
+                */
                s = intr_disable();
                wbinvd();
-               tsc_sync_bp(ci);
+               tsc_test_sync_bp(ci);
                intr_restore(s);
-               drift -= ci->ci_tsc_skew;
-#ifdef TSC_DEBUG
-               printf("TSC skew=%lld drift=%lld\n",
-                   (long long)ci->ci_tsc_skew, (long long)drift);
-#endif
-               tsc_sync_drift(drift);
        }
  }
@@ -932,16 +923,17 @@ cpu_hatch(void *v)
  #endif
/*
-        * Synchronize the TSC for the first time. Note that interrupts are
-        * off at this point.
+        * Test if our TSC is synchronized with the BP's TSC.
+        * Note that interrupts are off at this point.  Invalidate
+        * cache to minimize cache effects on the test.
         */
        wbinvd();
        ci->ci_flags |= CPUF_PRESENT;
-       ci->ci_tsc_skew = 0; /* reset on resume */
-       tsc_sync_ap(ci);
+       tsc_test_sync_ap(ci);
lapic_enable();
        lapic_startclock();
+
        cpu_ucode_apply(ci);
        cpu_tsx_disable(ci);
Index: include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.141
diff -u -p -r1.141 cpu.h
--- include/cpu.h       31 Aug 2021 17:40:59 -0000      1.141
+++ include/cpu.h       23 Feb 2022 02:23:26 -0000
@@ -209,8 +209,6 @@ struct cpu_info {
        paddr_t         ci_vmxon_region_pa;
        struct vmxon_region *ci_vmxon_region;
- int64_t ci_tsc_skew; /* counter skew vs cpu0 */
-
        char            ci_panicbuf[512];
paddr_t ci_vmcs_pa;
@@ -230,7 +228,6 @@ struct cpu_info {
  #define CPUF_INVAR_TSC        0x0100          /* CPU has invariant TSC */
  #define CPUF_USERXSTATE       0x0200          /* CPU has curproc's xsave 
state */
-#define CPUF_SYNCTSC 0x0800 /* Synchronize TSC */
  #define CPUF_PRESENT  0x1000          /* CPU is present */
  #define CPUF_RUNNING  0x2000          /* CPU is running */
  #define CPUF_PAUSE    0x4000          /* CPU is paused in DDB */
Index: include/cpuvar.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 cpuvar.h
--- include/cpuvar.h    16 May 2021 04:33:05 -0000      1.11
+++ include/cpuvar.h    23 Feb 2022 02:23:26 -0000
@@ -97,8 +97,7 @@ void identifycpu(struct cpu_info *);
  void cpu_init(struct cpu_info *);
  void cpu_init_first(void);
-void tsc_sync_drift(int64_t);
-void tsc_sync_bp(struct cpu_info *);
-void tsc_sync_ap(struct cpu_info *);
+void tsc_test_sync_bp(struct cpu_info *);
+void tsc_test_sync_ap(struct cpu_info *);
#endif


--
Yuichiro NAITO ([email protected])

Reply via email to