Mark the TSC as reliable if the hypervisor (KVM) has enumerated the TSC
as constant and nonstop.  As in most (all?) virtualization setups, any
secondary clocksource that's used as a watchdog is guaranteed to be less
reliable than a constant, nonstop TSC, because every clocksource the kernel
might use as a watchdog is all but guaranteed to be emulated when running
as a KVM guest.  I.e. any observed discrepancies between the TSC and the
watchdog will be due to jitter in the watchdog.

This is especially true for KVM, as the watchdog clocksource is usually
emulated in host userspace, i.e. reading the clock incurs a roundtrip
cost of thousands of cycles.

Marking the TSC reliable addresses a flaw where the TSC will occasionally
be marked unstable if the host is under moderate/heavy load.

Reviewed-by: David Woodhouse <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
 arch/x86/include/asm/kvm_para.h |  2 +-
 arch/x86/kernel/kvm.c           | 12 +++++++++++-
 arch/x86/kernel/kvmclock.c      | 14 +++++---------
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 4a47c16e2df8..4a49fc286b4c 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -118,7 +118,7 @@ static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
 }
 
 #ifdef CONFIG_KVM_GUEST
-void kvmclock_init(void);
+void kvmclock_init(bool prefer_tsc);
 void kvmclock_disable(void);
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 909d3e5e5bcd..1cef54e1e7d9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -978,6 +978,7 @@ static void __init kvm_init_platform(void)
                .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
        };
        u32 timing_info_leaf;
+       bool tsc_is_reliable;
 
        if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
            kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
@@ -1040,7 +1041,16 @@ static void __init kvm_init_platform(void)
                }
        }
 
-       kvmclock_init();
+        /*
+         * If the TSC counts at a constant frequency across P/T states and in
+         * deep C-states, treat the TSC as reliable, as guaranteed by KVM.
+         */
+       tsc_is_reliable = boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+                         boot_cpu_has(X86_FEATURE_NONSTOP_TSC);
+       if (tsc_is_reliable)
+               setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+       kvmclock_init(tsc_is_reliable);
        x86_platform.apic_post_init = kvm_apic_init;
 
        /*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f55d0305d1f3..2e7ab54cb9dc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -307,7 +307,7 @@ static int kvmclock_setup_percpu(unsigned int cpu)
        return p ? 0 : -ENOMEM;
 }
 
-void __init kvmclock_init(void)
+void __init kvmclock_init(bool prefer_tsc)
 {
        u8 flags;
 
@@ -356,15 +356,11 @@ void __init kvmclock_init(void)
        kvm_get_preset_lpj();
 
        /*
-        * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
-        * with P/T states and does not stop in deep C-states.
-        *
-        * Invariant TSC exposed by host means kvmclock is not necessary:
-        * can use TSC as clocksource.
-        *
+        * If TSC is preferred over kvmclock, drop kvmclock's rating so that TSC
+        * is chosen as the clocksource (but still register kvmclock in case
+        * the kernel doesn't want to use TSC for whatever reason).
         */
-       if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-           boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+       if (prefer_tsc)
                kvm_clock.rating = 299;
 
        clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
-- 
2.55.0.rc0.799.gd6f94ed593-goog


Reply via email to