On 4/8/2024 3:07 PM, Jack Allister wrote:
> This test proves that there is an inherent KVM/PV clock drift away from the
> guest TSC when KVM decides to update the PV time information structure due
> to a KVM_REQ_MASTERCLOCK_UPDATE. This drift is exascerbated when a guest is
> using TSC scaling and running at a different frequency to the host TSC [1].
> It also proves that KVM_[GS]ET_CLOCK_GUEST API is working to mitigate the
> drift from TSC to within ±1ns.
> 
> The test simply records the PVTI (PV time information) at time of guest
> creation, after KVM has updated it's mapped PVTI structure and once the
> correction has taken place.
> 
> A singular point in time is then recorded via the guest TSC and is used to
> calculate the a PV clock value using each of the 3 PVTI structures.
> 
> As seen below a drift of ~3500ns is observed if no correction has taken
> place after KVM has updated the PVTI via master clock update. However,
> after the correction a delta of at most 1ns can be seen.
> 
> * selftests: kvm: pvclock_test
> * scaling tsc from 2999999KHz to 1499999KHz
> * before=5038374946 uncorrected=5038371437 corrected=5038374945
> * delta_uncorrected=3509 delta_corrected=1
> 
> Clocksource check code has been borrowed from [2].
> 
> [1]: 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=451a707813ae
> [2]: 
> https://lore.kernel.org/kvm/[email protected]/
> 
> Signed-off-by: Jack Allister <[email protected]>
> CC: David Woodhouse <[email protected]>
> CC: Paul Durrant <[email protected]>
> ---
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../selftests/kvm/x86_64/pvclock_test.c       | 223 ++++++++++++++++++
>  2 files changed, 224 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/x86_64/pvclock_test.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile 
> b/tools/testing/selftests/kvm/Makefile
> index 741c7dc16afc..02ee1205bbed 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -87,6 +87,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
>  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
>  TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
>  TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
> +TEST_GEN_PROGS_x86_64 += x86_64/pvclock_test
>  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
>  TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
>  TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
> diff --git a/tools/testing/selftests/kvm/x86_64/pvclock_test.c 
> b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
> new file mode 100644
> index 000000000000..172ef4d19c60
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/pvclock_test.c
> @@ -0,0 +1,223 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2024, Amazon.com, Inc. or its affiliates.
> + *
> + * Tests for pvclock API
> + * KVM_SET_CLOCK_GUEST/KVM_GET_CLOCK_GUEST
> + */
> +#include <asm/pvclock.h>
> +#include <asm/pvclock-abi.h>
> +#include <sys/stat.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +
> +#include "test_util.h"
> +#include "kvm_util.h"
> +#include "processor.h"
> +
> +enum {
> +     STAGE_FIRST_BOOT,
> +     STAGE_UNCORRECTED,
> +     STAGE_CORRECTED,
> +     NUM_STAGES
> +};
> +
> +#define KVMCLOCK_GPA 0xc0000000ull
> +#define KVMCLOCK_SIZE sizeof(struct pvclock_vcpu_time_info)
> +
> +static void trigger_pvti_update(vm_paddr_t pvti_pa)
> +{
> +     /*
> +      * We need a way to trigger KVM to update the fields
> +      * in the PV time info. The easiest way to do this is
> +      * to temporarily switch to the old KVM system time
> +      * method and then switch back to the new one.
> +      */
> +     wrmsr(MSR_KVM_SYSTEM_TIME, pvti_pa | KVM_MSR_ENABLED);
> +     wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
> +}
> +
> +static void guest_code(vm_paddr_t pvti_pa)
> +{
> +     struct pvclock_vcpu_time_info *pvti_va =
> +             (struct pvclock_vcpu_time_info *)pvti_pa;
> +
> +     struct pvclock_vcpu_time_info pvti_boot;
> +     struct pvclock_vcpu_time_info pvti_uncorrected;
> +     struct pvclock_vcpu_time_info pvti_corrected;
> +     uint64_t cycles_boot;
> +     uint64_t cycles_uncorrected;
> +     uint64_t cycles_corrected;
> +     uint64_t tsc_guest;
> +
> +     /*
> +      * Setup the KVMCLOCK in the guest & store the original
> +      * PV time structure that is used.
> +      */
> +     wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
> +     pvti_boot = *pvti_va;
> +     GUEST_SYNC(STAGE_FIRST_BOOT);
> +
> +     /*
> +      * Trigger an update of the PVTI, if we calculate
> +      * the KVM clock using this structure we'll see
> +      * a drift from the TSC.
> +      */
> +     trigger_pvti_update(pvti_pa);
> +     pvti_uncorrected = *pvti_va;
> +     GUEST_SYNC(STAGE_UNCORRECTED);
> +
> +     /*
> +      * The test should have triggered the correction by this
> +      * point in time. We have a copy of each of the PVTI structs
> +      * at each stage now.
> +      *
> +      * Let's sample the timestamp at a SINGLE point in time and
> +      * then calculate what the KVM clock would be using the PVTI
> +      * from each stage.
> +      *
> +      * Then return each of these values to the tester.
> +      */
> +     pvti_corrected = *pvti_va;
> +     tsc_guest = rdtsc();
> +
> +     cycles_boot = __pvclock_read_cycles(&pvti_boot, tsc_guest);
> +     cycles_uncorrected = __pvclock_read_cycles(&pvti_uncorrected, 
> tsc_guest);
> +     cycles_corrected = __pvclock_read_cycles(&pvti_corrected, tsc_guest);
> +
> +     GUEST_SYNC_ARGS(STAGE_CORRECTED, cycles_boot, cycles_uncorrected,
> +                     cycles_corrected, 0);
> +}
> +
> +static void run_test(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
> +{
> +     struct ucall uc;
> +     uint64_t ucall_reason;
> +     struct pvclock_vcpu_time_info pvti_before;
> +     uint64_t before, uncorrected, corrected;
> +     int64_t delta_uncorrected, delta_corrected;
> +
> +     /* Loop through each stage of the test. */
> +     while (true) {
> +
> +             /* Start/restart the running vCPU code. */
> +             vcpu_run(vcpu);
> +             TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> +
> +             /* Retrieve and verify our stage. */
> +             ucall_reason = get_ucall(vcpu, &uc);
> +             TEST_ASSERT(ucall_reason == UCALL_SYNC,
> +                         "Unhandled ucall reason=%lu",
> +                         ucall_reason);
> +
> +             /* Run host specific code relating to stage. */
> +             switch (uc.args[1]) {
> +             case STAGE_FIRST_BOOT:
> +                     /* Store the KVM clock values before an update. */
> +                     vm_ioctl(vm, KVM_GET_CLOCK_GUEST, &pvti_before);
> +
> +                     /* Sleep for a set amount of time to induce drift. */
> +                     sleep(5);
> +                     break;
> +
> +             case STAGE_UNCORRECTED:
> +                     /* Restore the KVM clock values. */
> +                     vm_ioctl(vm, KVM_SET_CLOCK_GUEST, &pvti_before);
> +                     break;
> +
> +             case STAGE_CORRECTED:
> +                     /* Query the clock information and verify delta. */
> +                     before = uc.args[2];
> +                     uncorrected = uc.args[3];
> +                     corrected = uc.args[4];
> +
> +                     delta_uncorrected = before - uncorrected;
> +                     delta_corrected = before - corrected;
> +
> +                     pr_info("before=%lu uncorrected=%lu corrected=%lu\n",
> +                             before, uncorrected, corrected);
> +
> +                     pr_info("delta_uncorrected=%ld delta_corrected=%ld\n",
> +                             delta_uncorrected, delta_corrected);
> +
> +                     TEST_ASSERT((delta_corrected <= 1) && (delta_corrected 
> >= -1),
> +                                 "larger than expected delta detected = 
> %ld", delta_corrected);

I'm wondering what's the underling theory that we definitely can achieve
±1ns accuracy? I tested it on a Sapphire Rapids @2100MHz TSC frequency,
and I can see delta_corrected=2 in ~2% cases.

Reply via email to