On Wed, May 20, 2026, David Woodhouse wrote:
> On Fri, 2026-05-15 at 12:19 -0700, Sean Christopherson wrote:
> > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> > index b5991d53fc0e..e9e7394140dd 100644
> > --- a/arch/x86/kernel/kvmclock.c
> > +++ b/arch/x86/kernel/kvmclock.c
> > @@ -321,8 +321,8 @@ void __init kvmclock_init(void)
> >     flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
> >     kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
> >  
> > -   x86_platform.calibrate_tsc = kvm_get_tsc_khz;
> > -   x86_platform.calibrate_cpu = kvm_get_tsc_khz;
> > +   tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz);
> > +
> >     x86_platform.get_wallclock = kvm_get_wallclock;
> >     x86_platform.set_wallclock = kvm_set_wallclock;
> >  #ifdef CONFIG_X86_LOCAL_APIC
> 
> Can we move those (and maybe everything in the context there too) up
> *before* the check for no-kvmclock at the top of the function?

Oof, I was going to say "no", but disabling kvmclock is exactly the workaround
I've told people to use to get the kernel to use the TSC instead of kvmclock.

> Probably in a separate patch.

Ya.  I think this?

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08ee4bc304c8..92a1ebf31e4d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -27,6 +27,7 @@ static int kvmclock_vsyscall __initdata = 1;
 static int msr_kvm_system_time __ro_after_init;
 static int msr_kvm_wall_clock __ro_after_init;
 static u64 kvm_sched_clock_offset __ro_after_init;
+static unsigned int kvm_tsc_khz_cpuid __ro_after_init;
 
 static int __init parse_no_kvmclock(char *arg)
 {
@@ -207,7 +208,7 @@ static unsigned long kvm_get_tsc_khz(void)
                lapic_timer_period = apic_khz * 1000 / HZ;
 #endif
 
-       return kvm_para_tsc_khz() ? : pvclock_tsc_khz(this_cpu_pvti());
+       return kvm_tsc_khz_cpuid ? : pvclock_tsc_khz(this_cpu_pvti());
 }
 
 static unsigned long kvm_get_cpu_khz(void)
@@ -387,9 +388,39 @@ void __init kvmclock_init(void)
        enum tsc_properties tsc_properties = TSC_FREQUENCY_KNOWN;
        bool stable = false;
 
-       if (!kvm_para_available() || !kvmclock)
+       if (!kvm_para_available())
                return;
 
+       /*
+        * If the TSC counts at a constant frequency across P/T states, counts
+        * in deep C-states, and the TSC hasn't been marked unstable, treat the
+        * TSC as reliable, as guaranteed by KVM.  Note, the TSC unstable check
+        * exists purely to honor the TSC being marked unstable via command
+        * line; any runtime detection of an unstable TSC happens after this.
+        */
+       if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+           boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+           !check_tsc_unstable())
+               tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
+
+       kvm_tsc_khz_cpuid = kvm_para_tsc_khz();
+
+       /*
+        * If provided, use the TSC (and APIC bus) frequency provided in KVM's
+        * PV CPUID leaf even if kvmclock itself is disabled via command line.
+        * The PV CPUID information isn't dependent on kvmclock in any way, and
+        * in fact using the precise information is *more* important when the
+        * user has explicitly disabled kvmclock to force the kernel to use the
+        * TSC as its clocksource.
+        */
+       if (!kvmclock) {
+               if (kvm_tsc_khz_cpuid)
+                       tsc_register_calibration_routines(kvm_get_tsc_khz,
+                                                         kvm_get_cpu_khz,
+                                                         tsc_properties);
+               return;
+       }
+
        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
                msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
                msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
@@ -424,21 +455,14 @@ void __init kvmclock_init(void)
        }
 
        /*
-        * If the TSC counts at a constant frequency across P/T states, counts
-        * in deep C-states, and the TSC hasn't been marked unstable, prefer
-        * the TSC over kvmclock for sched_clock and drop kvmclock's rating so
-        * that TSC is chosen as the clocksource.  Note, the TSC unstable check
-        * exists purely to honor the TSC being marked unstable via command
-        * line, any runtime detection of an unstable will happen after this.
+        * If the TSC is reliable (see above), prefer the TSC over kvmclock for
+        * sched_clock and drop kvmclock's rating so that TSC is chosen as the
+        * clocksource.
         */
-       if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-           boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
-           !check_tsc_unstable()) {
+       if (tsc_properties & TSC_RELIABLE)
                kvm_clock.rating = 299;
-               tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
-       } else {
+       else
                kvm_sched_clock_init(stable);
-       }
 
        tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_cpu_khz,
                                          tsc_properties);

Reply via email to