Hi, Scott.
I tested the v5 patch on my ESXi host (Ryzen 7).
It works fine for me.
$ sysctl -a | grep tsc
kern.timecounter.hardware=tsc
kern.timecounter.choice=i8254(0) acpihpet0(1000) tsc(2000)
acpitimer0(1000)
machdep.tscfreq=3593269150
machdep.invarianttsc=1
$ sysctl kern.timecounter
kern.timecounter.tick=1
kern.timecounter.timestepwarnings=0
kern.timecounter.hardware=tsc
kern.timecounter.choice=i8254(0) acpihpet0(1000) tsc(2000)
acpitimer0(1000)
$ dmesg | grep failed
$
dmesg:
OpenBSD 7.2-beta (GENERIC.MP) #2: Mon Aug 1 14:53:55 JST 2022
[email protected]:/usr/src/sys/arch/amd64/compile/GENERIC.MP
real mem = 17161912320 (16366MB)
avail mem = 16624406528 (15854MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.7 @ 0xe0010 (260 entries)
bios0: vendor Phoenix Technologies LTD version "6.00" date 12/12/2018
bios0: VMware, Inc. VMware Virtual Platform
acpi0 at bios0: ACPI 4.0
acpi0: sleep states S0 S1 S4 S5
acpi0: tables DSDT FACP BOOT APIC MCFG SRAT HPET WAET
acpi0: wakeup devices PCI0(S3) USB_(S1) P2P0(S3) S1F0(S3) S2F0(S3) S8F0(S3)
S16F(S3) S17F(S3) S18F(S3) S22F(S3) S23F(S3) S24F(S3) S25F(S3) PE40(S3)
S1F0(S3) PE50(S3) [...]
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: AMD Ryzen 7 3700X 8-Core Processor, 3592.98 MHz, 17-71-00
cpu0:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu0: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu0: 512KB 64b/line 8-way L2 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
cpu0: apic clock running at 65MHz
cpu1 at mainbus0: apid 2 (application processor)
cpu1: AMD Ryzen 7 3700X 8-Core Processor, 3592.75 MHz, 17-71-00
cpu1:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu1: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu1: 512KB 64b/line 8-way L2 cache
cpu1: smt 0, core 0, package 2
cpu2 at mainbus0: apid 4 (application processor)
cpu2: AMD Ryzen 7 3700X 8-Core Processor, 3592.75 MHz, 17-71-00
cpu2:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu2: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu2: 512KB 64b/line 8-way L2 cache
cpu2: smt 0, core 0, package 4
cpu3 at mainbus0: apid 6 (application processor)
cpu3: AMD Ryzen 7 3700X 8-Core Processor, 3592.75 MHz, 17-71-00
cpu3:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu3: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu3: 512KB 64b/line 8-way L2 cache
cpu3: smt 0, core 0, package 6
cpu4 at mainbus0: apid 8 (application processor)
cpu4: AMD Ryzen 7 3700X 8-Core Processor, 3592.75 MHz, 17-71-00
cpu4:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu4: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu4: 512KB 64b/line 8-way L2 cache
cpu4: smt 0, core 0, package 8
cpu5 at mainbus0: apid 10 (application processor)
cpu5: AMD Ryzen 7 3700X 8-Core Processor, 3592.75 MHz, 17-71-00
cpu5:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu5: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu5: 512KB 64b/line 8-way L2 cache
cpu5: smt 0, core 0, package 10
cpu6 at mainbus0: apid 12 (application processor)
cpu6: AMD Ryzen 7 3700X 8-Core Processor, 3592.75 MHz, 17-71-00
cpu6:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu6: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu6: 512KB 64b/line 8-way L2 cache
cpu6: smt 0, core 0, package 12
cpu7 at mainbus0: apid 14 (application processor)
cpu7: AMD Ryzen 7 3700X 8-Core Processor, 3592.76 MHz, 17-71-00
cpu7:
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,FMA3,CX16,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,MMXX,FFXSR,PAGE1GB,RDTSCP,LONG,LAHF,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,OSVW,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,SHA,IBPB,XSAVEOPT,XSAVEC,XSAVES
cpu7: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache
cpu7: 512KB 64b/line 8-way L2 cache
cpu7: smt 0, core 0, package 14
ioapic0 at mainbus0: apid 1 pa 0xfec00000, version 20, 24 pins
acpimcfg0 at acpi0
acpimcfg0: addr 0xf0000000, bus 0-127
acpihpet0 at acpi0: 14318179 Hz
acpiprt0 at acpi0: bus 0 (PCI0)
acpipci0 at acpi0 PCI0: 0x00000000 0x00000011 0x00000001
acpicmos0 at acpi0
"PNP0A05" at acpi0 not configured
acpiac0 at acpi0: AC unit online
acpicpu0 at acpi0: C1(@1 halt!)
acpicpu1 at acpi0: C1(@1 halt!)
acpicpu2 at acpi0: C1(@1 halt!)
acpicpu3 at acpi0: C1(@1 halt!)
acpicpu4 at acpi0: C1(@1 halt!)
acpicpu5 at acpi0: C1(@1 halt!)
acpicpu6 at acpi0: C1(@1 halt!)
acpicpu7 at acpi0: C1(@1 halt!)
pvbus0 at mainbus0: VMware
vmt0 at pvbus0
pci0 at mainbus0 bus 0
pchb0 at pci0 dev 0 function 0 "Intel 82443BX AGP" rev 0x01
ppb0 at pci0 dev 1 function 0 "Intel 82443BX AGP" rev 0x01
pci1 at ppb0 bus 1
pcib0 at pci0 dev 7 function 0 "Intel 82371AB PIIX4 ISA" rev 0x08
pciide0 at pci0 dev 7 function 1 "Intel 82371AB IDE" rev 0x01: DMA, channel 0
configured to compatibility, channel 1 configured to compatibility
pciide0: channel 0 disabled (no drives)
pciide0: channel 1 disabled (no drives)
piixpm0 at pci0 dev 7 function 3 "Intel 82371AB Power" rev 0x08: SMBus disabled
"VMware VMCI" rev 0x10 at pci0 dev 7 function 7 not configured
vga1 at pci0 dev 15 function 0 "VMware SVGA II" rev 0x00
wsdisplay0 at vga1 mux 1: console (80x25, vt100 emulation)
wsdisplay0: screen 1-5 added (80x25, vt100 emulation)
mpi0 at pci0 dev 16 function 0 "Symbios Logic 53c1030" rev 0x01: apic 1 int 17
mpi0: 0, firmware 1.3.41.32
scsibus1 at mpi0: 16 targets, initiator 7
sd0 at scsibus1 targ 0 lun 0: <VMware, Virtual disk, 2.0>
sd0: 32768MB, 512 bytes/sector, 67108864 sectors
mpi0: target 0 Sync at 160MHz width 16bit offset 127 QAS 1 DT 1 IU 1
ppb1 at pci0 dev 17 function 0 "VMware PCI" rev 0x02
pci2 at ppb1 bus 2
uhci0 at pci2 dev 0 function 0 "VMware UHCI" rev 0x00: apic 1 int 18
ehci0 at pci2 dev 1 function 0 "VMware EHCI" rev 0x00: apic 1 int 19
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 configuration 1 interface 0 "VMware EHCI root hub" rev 2.00/1.00
addr 1
ahci0 at pci2 dev 3 function 0 "VMware AHCI" rev 0x00: msi, AHCI 1.3
ahci0: port 0: 6.0Gb/s
scsibus2 at ahci0: 32 targets
cd0 at scsibus2 targ 0 lun 0: <NECVMWar, VMware SATA CD00, 1.00> removable
usb1 at uhci0: USB revision 1.0
uhub1 at usb1 configuration 1 interface 0 "VMware UHCI root hub" rev 1.00/1.00
addr 1
ppb2 at pci0 dev 21 function 0 "VMware PCIE" rev 0x01: msi
pci3 at ppb2 bus 3
em0 at pci3 dev 0 function 0 "Intel 82574L" rev 0x00: msi, address
00:0c:29:df:6b:6f
ppb3 at pci0 dev 21 function 1 "VMware PCIE" rev 0x01: msi
pci4 at ppb3 bus 4
ppb4 at pci0 dev 21 function 2 "VMware PCIE" rev 0x01: msi
pci5 at ppb4 bus 5
ppb5 at pci0 dev 21 function 3 "VMware PCIE" rev 0x01: msi
pci6 at ppb5 bus 6
ppb6 at pci0 dev 21 function 4 "VMware PCIE" rev 0x01: msi
pci7 at ppb6 bus 7
ppb7 at pci0 dev 21 function 5 "VMware PCIE" rev 0x01: msi
pci8 at ppb7 bus 8
ppb8 at pci0 dev 21 function 6 "VMware PCIE" rev 0x01: msi
pci9 at ppb8 bus 9
ppb9 at pci0 dev 21 function 7 "VMware PCIE" rev 0x01: msi
pci10 at ppb9 bus 10
ppb10 at pci0 dev 22 function 0 "VMware PCIE" rev 0x01: msi
pci11 at ppb10 bus 11
ppb11 at pci0 dev 22 function 1 "VMware PCIE" rev 0x01: msi
pci12 at ppb11 bus 12
ppb12 at pci0 dev 22 function 2 "VMware PCIE" rev 0x01: msi
pci13 at ppb12 bus 13
ppb13 at pci0 dev 22 function 3 "VMware PCIE" rev 0x01: msi
pci14 at ppb13 bus 14
ppb14 at pci0 dev 22 function 4 "VMware PCIE" rev 0x01: msi
pci15 at ppb14 bus 15
ppb15 at pci0 dev 22 function 5 "VMware PCIE" rev 0x01: msi
pci16 at ppb15 bus 16
ppb16 at pci0 dev 22 function 6 "VMware PCIE" rev 0x01: msi
pci17 at ppb16 bus 17
ppb17 at pci0 dev 22 function 7 "VMware PCIE" rev 0x01: msi
pci18 at ppb17 bus 18
ppb18 at pci0 dev 23 function 0 "VMware PCIE" rev 0x01: msi
pci19 at ppb18 bus 19
ppb19 at pci0 dev 23 function 1 "VMware PCIE" rev 0x01: msi
pci20 at ppb19 bus 20
ppb20 at pci0 dev 23 function 2 "VMware PCIE" rev 0x01: msi
pci21 at ppb20 bus 21
ppb21 at pci0 dev 23 function 3 "VMware PCIE" rev 0x01: msi
pci22 at ppb21 bus 22
ppb22 at pci0 dev 23 function 4 "VMware PCIE" rev 0x01: msi
pci23 at ppb22 bus 23
ppb23 at pci0 dev 23 function 5 "VMware PCIE" rev 0x01: msi
pci24 at ppb23 bus 24
ppb24 at pci0 dev 23 function 6 "VMware PCIE" rev 0x01: msi
pci25 at ppb24 bus 25
ppb25 at pci0 dev 23 function 7 "VMware PCIE" rev 0x01: msi
pci26 at ppb25 bus 26
ppb26 at pci0 dev 24 function 0 "VMware PCIE" rev 0x01: msi
pci27 at ppb26 bus 27
ppb27 at pci0 dev 24 function 1 "VMware PCIE" rev 0x01: msi
pci28 at ppb27 bus 28
ppb28 at pci0 dev 24 function 2 "VMware PCIE" rev 0x01: msi
pci29 at ppb28 bus 29
ppb29 at pci0 dev 24 function 3 "VMware PCIE" rev 0x01: msi
pci30 at ppb29 bus 30
ppb30 at pci0 dev 24 function 4 "VMware PCIE" rev 0x01: msi
pci31 at ppb30 bus 31
ppb31 at pci0 dev 24 function 5 "VMware PCIE" rev 0x01: msi
pci32 at ppb31 bus 32
ppb32 at pci0 dev 24 function 6 "VMware PCIE" rev 0x01: msi
pci33 at ppb32 bus 33
ppb33 at pci0 dev 24 function 7 "VMware PCIE" rev 0x01: msi
pci34 at ppb33 bus 34
isa0 at pcib0
isadma0 at isa0
pckbc0 at isa0 port 0x60/5 irq 1 irq 12
pckbd0 at pckbc0 (kbd slot)
wskbd0 at pckbd0: console keyboard, using wsdisplay0
pms0 at pckbc0 (aux slot)
wsmouse0 at pms0 mux 0
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
uhidev0 at uhub1 port 1 configuration 1 interface 0 "VMware VMware Virtual USB
Mouse" rev 1.10/1.02 addr 2
uhidev0: iclass 3/0
ums0 at uhidev0: 16 buttons, Z and W dir
wsmouse1 at ums0 mux 0
uhidev1 at uhub1 port 1 configuration 1 interface 1 "VMware VMware Virtual USB
Mouse" rev 1.10/1.02 addr 2
uhidev1: iclass 3/0
ums1 at uhidev1: 16 buttons, Z and W dir
wsmouse2 at ums1 mux 0
uhub2 at uhub1 port 2 configuration 1 interface 0 "VMware VMware Virtual USB
Hub" rev 1.10/1.00 addr 3
vscsi0 at root
scsibus3 at vscsi0: 256 targets
softraid0 at root
scsibus4 at softraid0: 256 targets
root on sd0a (83e4a196c47f8904.a) swap on sd0b dump on sd0b
--
ASOU Masato
From: Scott Cheloha <[email protected]>
Date: Sat, 30 Jul 2022 22:13:21 -0500
> Hi,
>
> At the urging of sthen@ and dv@, here is v5.
>
> Two major changes from v4:
>
> - Add the function tc_reset_quality() to kern_tc.c and use it
> to lower the quality of the TSC timecounter if we fail the
> sync test.
>
> tc_reset_quality() will choose a new active timecounter if,
> after the quality change, the given timecounter is no longer
> the best timecounter.
>
> The upshot is: if you fail the TSC sync test you should boot
> with the HPET as your active timecounter. If you don't have
> an HPET you'll be using something else.
>
> - Drop the SMT accommodation from the hot loop. It hasn't been
> necessary since last year when I rewrote the test to run without
> a mutex. In the rewritten test, the two CPUs in the hot loop
> are not competing for any resources so they should not be able
> to starve one another.
>
> dv: Could you double-check that this still chooses the right
> timecounter on your machine? If so, I will ask deraadt@ to
> put this into snaps to replace v4.
>
> Additional test reports are welcome. Include your dmesg.
>
> --
>
> I do not see much more I can do to improve this patch.
>
> I am seeking patch review and OKs.
>
> I am especially interested in whether my assumptions in tsc_test_ap()
> and tsc_test_bp() are correct. The whole patch depends on those
> assumptions. Is this a valid way to test for TSC desync? Or am I
> missing membar_producer()/membar_consumer() calls?
>
> Here is the long version of "what" and "why" for this patch.
>
> The patch is attached at the end.
>
> - Computing a per-CPU TSC skew value is error-prone, especially
> on multisocket machines and VMs. My best guess is that larger
> latencies appear to the skew measurement test as TSC desync,
> and so the TSC is demoted to a kernel timecounter on these
> machines or marked non-monotonic.
>
> This patch eliminates per-CPU TSC skew values. Instead of trying
> to measure and correct for TSC desync we only try to detect desync,
> which is less error-prone. This approach should allow a wider
> variety of machines to use the TSC as a timecounter when running
> OpenBSD.
>
> - In the new sync test, both CPUs repeatedly try to detect whether
> their TSC is trailing the other CPU's TSC. The upside to this
> approach is that it yields no false positives (if my assumptions
> about AMD64 memory access and instruction serialization are correct).
> The downside to this approach is that it takes more time than the
> current skew measurement test. Each test round takes 1ms, and
> we run up to two rounds per CPU, so this patch slows boot down
> by 2ms per AP.
>
> - If any CPU fails the sync test, the TSC is marked non-monotonic
> and a different timecounter is activated. The TC_USER flag
> remains intact. There is no "middle ground" where we fall back
> to only using the TSC in the kernel.
>
> - Because there is no per-CPU skew value, there is also no concept
> of TSC drift anymore.
>
> - Before running the test, we check for the IA32_TSC_ADJUST
> register and reset it if necessary. This is a trivial way
> to work around firmware bugs that desync the TSC before we
> reach the kernel.
>
> Unfortunately, at the moment this register appears to only
> be available on Intel processors and I cannot find an equivalent
> but differently-named MSR for AMD processors.
>
> --
>
> Index: sys/arch/amd64/amd64/tsc.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
> retrieving revision 1.24
> diff -u -p -r1.24 tsc.c
> --- sys/arch/amd64/amd64/tsc.c 31 Aug 2021 15:11:54 -0000 1.24
> +++ sys/arch/amd64/amd64/tsc.c 31 Jul 2022 03:06:39 -0000
> @@ -36,13 +36,6 @@ int tsc_recalibrate;
> uint64_t tsc_frequency;
> int tsc_is_invariant;
>
> -#define TSC_DRIFT_MAX 250
> -#define TSC_SKEW_MAX 100
> -int64_t tsc_drift_observed;
> -
> -volatile int64_t tsc_sync_val;
> -volatile struct cpu_info *tsc_sync_cpu;
> -
> u_int tsc_get_timecount(struct timecounter *tc);
> void tsc_delay(int usecs);
>
> @@ -236,22 +229,12 @@ cpu_recalibrate_tsc(struct timecounter *
> u_int
> tsc_get_timecount(struct timecounter *tc)
> {
> - return rdtsc_lfence() + curcpu()->ci_tsc_skew;
> + return rdtsc_lfence();
> }
>
> void
> tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
> {
> -#ifdef TSC_DEBUG
> - printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname,
> - (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed);
> -#endif
> - if (ci->ci_tsc_skew < -TSC_SKEW_MAX || ci->ci_tsc_skew > TSC_SKEW_MAX) {
> - printf("%s: disabling user TSC (skew=%lld)\n",
> - ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew);
> - tsc_timecounter.tc_user = 0;
> - }
> -
> if (!(ci->ci_flags & CPUF_PRIMARY) ||
> !(ci->ci_flags & CPUF_CONST_TSC) ||
> !(ci->ci_flags & CPUF_INVAR_TSC))
> @@ -268,111 +251,264 @@ tsc_timecounter_init(struct cpu_info *ci
> calibrate_tsc_freq();
> }
>
> - if (tsc_drift_observed > TSC_DRIFT_MAX) {
> - printf("ERROR: %lld cycle TSC drift observed\n",
> - (long long)tsc_drift_observed);
> - tsc_timecounter.tc_quality = -1000;
> - tsc_timecounter.tc_user = 0;
> - tsc_is_invariant = 0;
> - }
> -
> tc_init(&tsc_timecounter);
> }
>
> -/*
> - * Record drift (in clock cycles). Called during AP startup.
> - */
> void
> -tsc_sync_drift(int64_t drift)
> +tsc_delay(int usecs)
> {
> - if (drift < 0)
> - drift = -drift;
> - if (drift > tsc_drift_observed)
> - tsc_drift_observed = drift;
> + uint64_t interval, start;
> +
> + interval = (uint64_t)usecs * tsc_frequency / 1000000;
> + start = rdtsc_lfence();
> + while (rdtsc_lfence() - start < interval)
> + CPU_BUSY_CYCLE();
> }
>
> +#ifdef MULTIPROCESSOR
> +
> +#define TSC_DEBUG 1
> +
> +/*
> + * Protections for global variables in this code:
> + *
> + * a Modified atomically
> + * b Protected by a barrier
> + * p Only modified by the primary CPU
> + */
> +
> +#define TSC_TEST_MSECS 1 /* Test round duration */
> +#define TSC_TEST_ROUNDS 2 /* Number of test rounds */
> +
> /*
> - * Called during startup of APs, by the boot processor. Interrupts
> - * are disabled on entry.
> + * tsc_test_status.val is isolated to its own cache line to limit
> + * false sharing and reduce the test's margin of error.
> */
> +struct tsc_test_status {
> + volatile uint64_t val; /* [a] Latest RDTSC value */
> + uint64_t pad1[7];
> + uint64_t lag_count; /* [b] Number of lags seen by CPU */
> + uint64_t lag_max; /* [b] Biggest lag seen by CPU */
> + int64_t adj; /* [b] Initial IA32_TSC_ADJUST value */
> + uint64_t pad2[5];
> +} __aligned(64);
> +struct tsc_test_status tsc_ap_status; /* Test results from AP */
> +struct tsc_test_status tsc_bp_status; /* Test results from BP */
> +uint64_t tsc_test_cycles; /* [p] TSC cycles per test round */
> +const char *tsc_ap_name; /* [b] Name of AP running test */
> +volatile u_int tsc_egress_barrier; /* [a] Test end barrier */
> +volatile u_int tsc_ingress_barrier; /* [a] Test start barrier */
> +volatile u_int tsc_test_rounds; /* [p] Remaining test rounds */
> +int tsc_is_synchronized = 1; /* [p] Have we ever failed the test? */
> +
> +void tsc_report_test_results(void);
> +void tsc_reset_adjust(struct tsc_test_status *);
> +void tsc_test_ap(void);
> +void tsc_test_bp(void);
> +
> void
> -tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
> +tsc_test_sync_bp(struct cpu_info *ci)
> {
> - uint64_t bptsc;
> -
> - if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL)
> - panic("tsc_sync_bp: 1");
> + if (!tsc_is_invariant)
> + return;
> +#ifndef TSC_DEBUG
> + /* No point in testing again if we already failed. */
> + if (!tsc_is_synchronized)
> + return;
> +#endif
> + /* Reset IA32_TSC_ADJUST if it exists. */
> + tsc_reset_adjust(&tsc_bp_status);
>
> - /* Flag it and read our TSC. */
> - atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC);
> - bptsc = (rdtsc_lfence() >> 1);
> + /* Reset the test cycle limit and round count. */
> + tsc_test_cycles = TSC_TEST_MSECS * tsc_frequency / 1000;
> + tsc_test_rounds = TSC_TEST_ROUNDS;
> +
> + do {
> + /*
> + * Pass through the ingress barrier, run the test,
> + * then wait for the AP to reach the egress barrier.
> + */
> + atomic_inc_int(&tsc_ingress_barrier);
> + while (tsc_ingress_barrier != 2)
> + CPU_BUSY_CYCLE();
> + tsc_test_bp();
> + while (tsc_egress_barrier != 1)
> + CPU_BUSY_CYCLE();
> +
> + /*
> + * Report what happened. Adjust the TSC's quality
> + * if this is the first time we've failed the test.
> + */
> + tsc_report_test_results();
> + if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
> + if (tsc_is_synchronized) {
> + tsc_is_synchronized = 0;
> + tc_reset_quality(&tsc_timecounter, -1000);
> + }
> + tsc_test_rounds = 0;
> + } else
> + tsc_test_rounds--;
> +
> + /*
> + * Clean up for the next round. It is safe to reset the
> + * ingress barrier because at this point we know the AP
> + * has reached the egress barrier.
> + */
> + memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
> + memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
> + tsc_ingress_barrier = 0;
> + if (tsc_test_rounds == 0)
> + tsc_ap_name = NULL;
> +
> + /*
> + * Pass through the egress barrier and release the AP.
> + * The AP is responsible for resetting the egress barrier.
> + */
> + if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
> + panic("%s: unexpected egress count", __func__);
> + } while (tsc_test_rounds > 0);
> +}
>
> - /* Wait for remote to complete, and read ours again. */
> - while ((ci->ci_flags & CPUF_SYNCTSC) != 0)
> - membar_consumer();
> - bptsc += (rdtsc_lfence() >> 1);
> +void
> +tsc_test_sync_ap(struct cpu_info *ci)
> +{
> + if (!tsc_is_invariant)
> + return;
> +#ifndef TSC_DEBUG
> + if (!tsc_is_synchronized)
> + return;
> +#endif
> + /* The BP needs our name in order to report any problems. */
> + if (atomic_cas_ptr(&tsc_ap_name, NULL, ci->ci_dev->dv_xname) != NULL) {
> + panic("%s: %s: tsc_ap_name is not NULL: %s",
> + __func__, ci->ci_dev->dv_xname, tsc_ap_name);
> + }
>
> - /* Wait for the results to come in. */
> - while (tsc_sync_cpu == ci)
> - CPU_BUSY_CYCLE();
> - if (tsc_sync_cpu != NULL)
> - panic("tsc_sync_bp: 2");
> + tsc_reset_adjust(&tsc_ap_status);
>
> - *bptscp = bptsc;
> - *aptscp = tsc_sync_val;
> + /*
> + * The AP is only responsible for running the test and
> + * resetting the egress barrier. The BP handles everything
> + * else.
> + */
> + do {
> + atomic_inc_int(&tsc_ingress_barrier);
> + while (tsc_ingress_barrier != 2)
> + CPU_BUSY_CYCLE();
> + tsc_test_ap();
> + atomic_inc_int(&tsc_egress_barrier);
> + while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
> + CPU_BUSY_CYCLE();
> + } while (tsc_test_rounds > 0);
> }
>
> void
> -tsc_sync_bp(struct cpu_info *ci)
> +tsc_report_test_results(void)
> {
> - uint64_t bptsc, aptsc;
> -
> - tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */
> - tsc_read_bp(ci, &bptsc, &aptsc);
> + u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;
>
> - /* Compute final value to adjust for skew. */
> - ci->ci_tsc_skew = bptsc - aptsc;
> + if (tsc_bp_status.adj != 0) {
> + printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n",
> + tsc_bp_status.adj);
> + }
> + if (tsc_ap_status.adj != 0) {
> + printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n",
> + tsc_ap_name, tsc_ap_status.adj);
> + }
> + if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) {
> + printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
> + tsc_ap_name, round, TSC_TEST_ROUNDS);
> + }
> + if (tsc_bp_status.lag_count > 0) {
> + printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
> + tsc_ap_name, tsc_bp_status.lag_count,
> + tsc_bp_status.lag_max);
> + }
> + if (tsc_ap_status.lag_count > 0) {
> + printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
> + tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
> + tsc_ap_status.lag_max);
> + }
> }
>
> /*
> - * Called during startup of AP, by the AP itself. Interrupts are
> - * disabled on entry.
> + * Reset IA32_TSC_ADJUST if we have it.
> + *
> + * XXX We should rearrange cpu_hatch() so that the feature
> + * flags are already set before we get here. Check CPUID
> + * by hand until then.
> */
> void
> -tsc_post_ap(struct cpu_info *ci)
> +tsc_reset_adjust(struct tsc_test_status *tts)
> {
> - uint64_t tsc;
> -
> - /* Wait for go-ahead from primary. */
> - while ((ci->ci_flags & CPUF_SYNCTSC) == 0)
> - membar_consumer();
> - tsc = (rdtsc_lfence() >> 1);
> + uint32_t eax, ebx, ecx, edx;
>
> - /* Instruct primary to read its counter. */
> - atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC);
> - tsc += (rdtsc_lfence() >> 1);
> -
> - /* Post result. Ensure the whole value goes out atomically. */
> - (void)atomic_swap_64(&tsc_sync_val, tsc);
> -
> - if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci)
> - panic("tsc_sync_ap");
> + CPUID(0, eax, ebx, ecx, edx);
> + if (eax >= 7) {
> + CPUID_LEAF(7, 0, eax, ebx, ecx, edx);
> + if (ISSET(ebx, SEFF0EBX_TSC_ADJUST)) {
> + tts->adj = rdmsr(MSR_TSC_ADJUST);
> + if (tts->adj != 0)
> + wrmsr(MSR_TSC_ADJUST, 0);
> + }
> + }
> }
>
> void
> -tsc_sync_ap(struct cpu_info *ci)
> +tsc_test_ap(void)
> {
> - tsc_post_ap(ci);
> - tsc_post_ap(ci);
> + uint64_t ap_val, bp_val, end, lag;
> +
> + ap_val = rdtsc_lfence();
> + end = ap_val + tsc_test_cycles;
> + while (__predict_false(ap_val < end)) {
> + /*
> + * Get the BP's latest TSC value, then read the AP's
> + * TSC. LFENCE is a serializing instruction, so we
> + * know bp_val predates ap_val. If ap_val is smaller
> + * than bp_val then the AP's TSC must trail that of
> + * the BP and the counters cannot be synchronized.
> + */
> + bp_val = tsc_bp_status.val;
> + ap_val = rdtsc_lfence();
> + tsc_ap_status.val = ap_val;
> +
> + /*
> + * Record the magnitude of the problem if the AP's TSC
> + * trails the BP's TSC.
> + */
> + if (__predict_false(ap_val < bp_val)) {
> + tsc_ap_status.lag_count++;
> + lag = bp_val - ap_val;
> + if (tsc_ap_status.lag_max < lag)
> + tsc_ap_status.lag_max = lag;
> + }
> + }
> }
>
> +/*
> + * This is similar to tsc_test_ap(), but with all relevant variables
> + * flipped around to run the test from the BP's perspective.
> + */
> void
> -tsc_delay(int usecs)
> +tsc_test_bp(void)
> {
> - uint64_t interval, start;
> + uint64_t ap_val, bp_val, end, lag;
>
> - interval = (uint64_t)usecs * tsc_frequency / 1000000;
> - start = rdtsc_lfence();
> - while (rdtsc_lfence() - start < interval)
> - CPU_BUSY_CYCLE();
> + bp_val = rdtsc_lfence();
> + end = bp_val + tsc_test_cycles;
> + while (__predict_false(bp_val < end)) {
> + ap_val = tsc_ap_status.val;
> + bp_val = rdtsc_lfence();
> + tsc_bp_status.val = bp_val;
> +
> + if (__predict_false(bp_val < ap_val)) {
> + tsc_bp_status.lag_count++;
> + lag = ap_val - bp_val;
> + if (tsc_bp_status.lag_max < lag)
> + tsc_bp_status.lag_max = lag;
> + }
> + }
> }
> +
> +#endif /* MULTIPROCESSOR */
> Index: sys/arch/amd64/amd64/cpu.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
> retrieving revision 1.156
> diff -u -p -r1.156 cpu.c
> --- sys/arch/amd64/amd64/cpu.c 26 Apr 2022 08:35:30 -0000 1.156
> +++ sys/arch/amd64/amd64/cpu.c 31 Jul 2022 03:06:40 -0000
> @@ -772,9 +772,9 @@ cpu_init(struct cpu_info *ci)
> lcr4(cr4 & ~CR4_PGE);
> lcr4(cr4);
>
> - /* Synchronize TSC */
> + /* Check if TSC is synchronized. */
> if (cold && !CPU_IS_PRIMARY(ci))
> - tsc_sync_ap(ci);
> + tsc_test_sync_ap(ci);
> #endif
> }
>
> @@ -854,18 +854,14 @@ cpu_start_secondary(struct cpu_info *ci)
> #endif
> } else {
> /*
> - * Synchronize time stamp counters. Invalidate cache and
> - * synchronize twice (in tsc_sync_bp) to minimize possible
> - * cache effects. Disable interrupts to try and rule out any
> - * external interference.
> + * Test if TSCs are synchronized. Invalidate cache to
> + * minimize possible cache effects. Disable interrupts to
> + * try to rule out external interference.
> */
> s = intr_disable();
> wbinvd();
> - tsc_sync_bp(ci);
> + tsc_test_sync_bp(ci);
> intr_restore(s);
> -#ifdef TSC_DEBUG
> - printf("TSC skew=%lld\n", (long long)ci->ci_tsc_skew);
> -#endif
> }
>
> if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
> @@ -890,7 +886,6 @@ void
> cpu_boot_secondary(struct cpu_info *ci)
> {
> int i;
> - int64_t drift;
> u_long s;
>
> atomic_setbits_int(&ci->ci_flags, CPUF_GO);
> @@ -905,18 +900,11 @@ cpu_boot_secondary(struct cpu_info *ci)
> db_enter();
> #endif
> } else if (cold) {
> - /* Synchronize TSC again, check for drift. */
> - drift = ci->ci_tsc_skew;
> + /* Test if TSCs are synchronized again. */
> s = intr_disable();
> wbinvd();
> - tsc_sync_bp(ci);
> + tsc_test_sync_bp(ci);
> intr_restore(s);
> - drift -= ci->ci_tsc_skew;
> -#ifdef TSC_DEBUG
> - printf("TSC skew=%lld drift=%lld\n",
> - (long long)ci->ci_tsc_skew, (long long)drift);
> -#endif
> - tsc_sync_drift(drift);
> }
> }
>
> @@ -942,13 +930,12 @@ cpu_hatch(void *v)
> #endif
>
> /*
> - * Synchronize the TSC for the first time. Note that interrupts are
> - * off at this point.
> + * Test if our TSC is synchronized for the first time.
> + * Note that interrupts are off at this point.
> */
> wbinvd();
> ci->ci_flags |= CPUF_PRESENT;
> - ci->ci_tsc_skew = 0; /* reset on resume */
> - tsc_sync_ap(ci);
> + tsc_test_sync_ap(ci);
>
> lapic_enable();
> lapic_startclock();
> Index: sys/arch/amd64/include/cpu.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
> retrieving revision 1.145
> diff -u -p -r1.145 cpu.h
> --- sys/arch/amd64/include/cpu.h 12 Jul 2022 04:46:00 -0000 1.145
> +++ sys/arch/amd64/include/cpu.h 31 Jul 2022 03:06:40 -0000
> @@ -207,8 +207,6 @@ struct cpu_info {
> paddr_t ci_vmxon_region_pa;
> struct vmxon_region *ci_vmxon_region;
>
> - int64_t ci_tsc_skew; /* counter skew vs cpu0 */
> -
> char ci_panicbuf[512];
>
> paddr_t ci_vmcs_pa;
> @@ -228,7 +226,6 @@ struct cpu_info {
> #define CPUF_INVAR_TSC 0x0100 /* CPU has invariant TSC */
> #define CPUF_USERXSTATE 0x0200 /* CPU has curproc's xsave
> state */
>
> -#define CPUF_SYNCTSC 0x0800 /* Synchronize TSC */
> #define CPUF_PRESENT 0x1000 /* CPU is present */
> #define CPUF_RUNNING 0x2000 /* CPU is running */
> #define CPUF_PAUSE 0x4000 /* CPU is paused in DDB */
> Index: sys/arch/amd64/include/cpuvar.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v
> retrieving revision 1.11
> diff -u -p -r1.11 cpuvar.h
> --- sys/arch/amd64/include/cpuvar.h 16 May 2021 04:33:05 -0000 1.11
> +++ sys/arch/amd64/include/cpuvar.h 31 Jul 2022 03:06:40 -0000
> @@ -97,8 +97,7 @@ void identifycpu(struct cpu_info *);
> void cpu_init(struct cpu_info *);
> void cpu_init_first(void);
>
> -void tsc_sync_drift(int64_t);
> -void tsc_sync_bp(struct cpu_info *);
> -void tsc_sync_ap(struct cpu_info *);
> +void tsc_test_sync_bp(struct cpu_info *);
> +void tsc_test_sync_ap(struct cpu_info *);
>
> #endif
> Index: sys/sys/timetc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/timetc.h,v
> retrieving revision 1.12
> diff -u -p -r1.12 timetc.h
> --- sys/sys/timetc.h 6 Jul 2020 13:33:09 -0000 1.12
> +++ sys/sys/timetc.h 31 Jul 2022 03:06:40 -0000
> @@ -120,6 +120,7 @@ extern struct timekeep *timekeep;
> u_int64_t tc_getfrequency(void);
> u_int64_t tc_getprecision(void);
> void tc_init(struct timecounter *tc);
> +void tc_reset_quality(struct timecounter *, int);
> void tc_setclock(const struct timespec *ts);
> void tc_setrealtimeclock(const struct timespec *ts);
> void tc_ticktock(void);
> Index: sys/kern/kern_tc.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_tc.c,v
> retrieving revision 1.76
> diff -u -p -r1.76 kern_tc.c
> --- sys/kern/kern_tc.c 23 Jul 2022 22:58:51 -0000 1.76
> +++ sys/kern/kern_tc.c 31 Jul 2022 03:06:40 -0000
> @@ -458,6 +458,38 @@ tc_init(struct timecounter *tc)
> timecounter = tc;
> }
>
> +/*
> + * Change the given timecounter's quality. If it is the active
> + * counter and it is no longer the best counter, activate the
> + * best counter.
> + */
> +void
> +tc_reset_quality(struct timecounter *tc, int quality)
> +{
> + struct timecounter *best = &dummy_timecounter, *tmp;
> +
> + if (tc == &dummy_timecounter)
> + panic("%s: cannot change dummy counter quality", __func__);
> +
> + tc->tc_quality = quality;
> + if (timecounter == tc) {
> + SLIST_FOREACH(tmp, &tc_list, tc_next) {
> + if (tmp->tc_quality < 0)
> + continue;
> + if (tmp->tc_quality < best->tc_quality)
> + continue;
> + if (tmp->tc_quality == best->tc_quality &&
> + tmp->tc_frequency < best->tc_frequency)
> + continue;
> + best = tmp;
> + }
> + if (best != tc) {
> + enqueue_randomness(best->tc_get_timecount(best));
> + timecounter = best;
> + }
> + }
> +}
> +
> /* Report the frequency of the current timecounter. */
> u_int64_t
> tc_getfrequency(void)
>