On Thu, Jul 29, 2021, Oliver Upton wrote:
> To date, VMM-directed TSC synchronization and migration has been a bit
> messy. KVM has some baked-in heuristics around TSC writes to infer if
> the VMM is attempting to synchronize. This is problematic, as it depends
> on host userspace writing to the guest's TSC within 1 second of the last
> write.
> 
> A much cleaner approach to configuring the guest's views of the TSC is to
> simply migrate the TSC offset for every vCPU. Offsets are idempotent,
> and thus not subject to change depending on when the VMM actually
> reads/writes values from/to KVM. The VMM can then read the TSC once with
> KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
> the guest is paused.
> 
> Cc: David Matlack <[email protected]>
> Signed-off-by: Oliver Upton <[email protected]>
> ---
>  Documentation/virt/kvm/devices/vcpu.rst |  57 ++++++++
>  arch/x86/include/asm/kvm_host.h         |   1 +
>  arch/x86/include/uapi/asm/kvm.h         |   4 +
>  arch/x86/kvm/x86.c                      | 167 ++++++++++++++++++++++++
>  4 files changed, 229 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
> b/Documentation/virt/kvm/devices/vcpu.rst
> index 2acec3b9ef65..0f46f2588905 100644
> --- a/Documentation/virt/kvm/devices/vcpu.rst
> +++ b/Documentation/virt/kvm/devices/vcpu.rst
> @@ -161,3 +161,60 @@ Specifies the base address of the stolen time structure 
> for this VCPU. The
>  base address must be 64 byte aligned and exist within a valid guest memory
>  region. See Documentation/virt/kvm/arm/pvtime.rst for more information
>  including the layout of the stolen time structure.
> +
> +4. GROUP: KVM_VCPU_TSC_CTRL
> +===========================
> +
> +:Architectures: x86
> +
> +4.1 ATTRIBUTE: KVM_VCPU_TSC_OFFSET
> +
> +:Parameters: 64-bit unsigned TSC offset
> +
> +Returns:
> +
> +      ======= ======================================
> +      -EFAULT Error reading/writing the provided
> +              parameter address.

There's a rogue space in here (git show highlights it).

> +      -ENXIO  Attribute not supported
> +      ======= ======================================
> +
> +Specifies the guest's TSC offset relative to the host's TSC. The guest's
> +TSC is then derived by the following equation:
> +
> +  guest_tsc = host_tsc + KVM_VCPU_TSC_OFFSET
> +
> +This attribute is useful for the precise migration of a guest's TSC. The
> +following describes a possible algorithm to use for the migration of a
> +guest's TSC:
> +
> +From the source VMM process:
> +
> +1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_0),
> +   kvmclock nanoseconds (k_0), and realtime nanoseconds (r_0).
> +
> +2. Read the KVM_VCPU_TSC_OFFSET attribute for every vCPU to record the
> +   guest TSC offset (off_n).
> +
> +3. Invoke the KVM_GET_TSC_KHZ ioctl to record the frequency of the
> +   guest's TSC (freq).
> +
> +From the destination VMM process:
> +
> +4. Invoke the KVM_SET_CLOCK ioctl, providing the kvmclock nanoseconds
> +   (k_0) and realtime nanoseconds (r_0) in their respective fields.
> +   Ensure that the KVM_CLOCK_REALTIME flag is set in the provided
> +   structure. KVM will advance the VM's kvmclock to account for elapsed
> +   time since recording the clock values.
> +
> +5. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_1) and
> +   kvmclock nanoseconds (k_1).
> +
> +6. Adjust the guest TSC offsets for every vCPU to account for (1) time
> +   elapsed since recording state and (2) difference in TSCs between the
> +   source and destination machine:
> +
> +   new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1
> +
> +7. Write the KVM_VCPU_TSC_OFFSET attribute for every vCPU with the
> +   respective value derived in the previous step.
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 65a20c64f959..855698923dd0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1070,6 +1070,7 @@ struct kvm_arch {
>       u64 last_tsc_nsec;
>       u64 last_tsc_write;
>       u32 last_tsc_khz;
> +     u64 last_tsc_offset;
>       u64 cur_tsc_nsec;
>       u64 cur_tsc_write;
>       u64 cur_tsc_offset;
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index a6c327f8ad9e..0b22e1e84e78 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -503,4 +503,8 @@ struct kvm_pmu_event_filter {
>  #define KVM_PMU_EVENT_ALLOW 0
>  #define KVM_PMU_EVENT_DENY 1
>  
> +/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
> +#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) 
> */
> +#define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 27435a07fb46..17d87a8d0c75 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2413,6 +2413,11 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu 
> *vcpu, u64 l1_offset)
>       static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
>  }
>  
> +static u64 kvm_vcpu_read_tsc_offset(struct kvm_vcpu *vcpu)
> +{
> +     return vcpu->arch.l1_tsc_offset;
> +}

This is not a helpful, uh, helper.  Based on the name, I would expect it to read
the TSC_OFFSET in the context of L1 vs. L2.  Assuming you really want L1's 
offset,
just read vcpu->arch.l1_tsc_offset directly.  That will obviate the "need" for
a local offset (see below).

> +
>  static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 
> l1_multiplier)
>  {
>       vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
> @@ -2469,6 +2474,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu 
> *vcpu, u64 offset, u64 tsc,
>       kvm->arch.last_tsc_nsec = ns;
>       kvm->arch.last_tsc_write = tsc;
>       kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
> +     kvm->arch.last_tsc_offset = offset;
>  
>       vcpu->arch.last_guest_tsc = tsc;
>  
> @@ -4928,6 +4934,137 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
>       return 0;
>  }
>  
> +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
> +                              struct kvm_device_attr *attr)
> +{
> +     int r;
> +
> +     switch (attr->attr) {
> +     case KVM_VCPU_TSC_OFFSET:
> +             r = 0;
> +             break;
> +     default:
> +             r = -ENXIO;
> +     }
> +
> +     return r;
> +}
> +
> +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
> +                              struct kvm_device_attr *attr)
> +{
> +     void __user *uaddr = (void __user *)attr->addr;
> +     int r;
> +
> +     switch (attr->attr) {
> +     case KVM_VCPU_TSC_OFFSET: {
> +             u64 offset;
> +
> +             offset = kvm_vcpu_read_tsc_offset(vcpu);
> +             r = -EFAULT;
> +             if (copy_to_user(uaddr, &offset, sizeof(offset)))
> +                     break;

Use put_user(), then this all becomes short and sweet without curly braces:

        case KVM_VCPU_TSC_OFFSET:
                r = -EFAULT;
                if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
                        break;
                r = 0;
                break;

> +
> +             r = 0;
> +             break;
> +     }
> +     default:
> +             r = -ENXIO;
> +     }
> +
> +     return r;
> +}
> +
> +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
> +                              struct kvm_device_attr *attr)
> +{
> +     void __user *uaddr = (void __user *)attr->addr;
> +     struct kvm *kvm = vcpu->kvm;
> +     int r;
> +
> +     switch (attr->attr) {
> +     case KVM_VCPU_TSC_OFFSET: {
> +             u64 offset, tsc, ns;
> +             unsigned long flags;
> +             bool matched;
> +
> +             r = -EFAULT;
> +             if (copy_from_user(&offset, uaddr, sizeof(offset)))

This can be get_user().

> +                     break;
> +
> +             raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +
> +             matched = (vcpu->arch.virtual_tsc_khz &&
> +                        kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz 
> &&
> +                        kvm->arch.last_tsc_offset == offset);
> +
> +             tsc = kvm_scale_tsc(vcpu, rdtsc(), 
> vcpu->arch.l1_tsc_scaling_ratio) + offset;
> +             ns = get_kvmclock_base_ns();
> +
> +             __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
> +             raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
> +
> +             r = 0;
> +             break;
> +     }
> +     default:
> +             r = -ENXIO;
> +     }
> +
> +     return r;
> +}

...

>  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>                                    struct kvm_enable_cap *cap)
>  {
> @@ -5382,6 +5519,36 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>               r = __set_sregs2(vcpu, u.sregs2);
>               break;
>       }
> +     case KVM_HAS_DEVICE_ATTR: {
> +             struct kvm_device_attr attr;
> +
> +             r = -EFAULT;
> +             if (copy_from_user(&attr, argp, sizeof(attr)))
> +                     goto out;
> +
> +             r = kvm_vcpu_ioctl_has_device_attr(vcpu, &attr);
> +             break;
> +     }
> +     case KVM_GET_DEVICE_ATTR: {
> +             struct kvm_device_attr attr;
> +
> +             r = -EFAULT;
> +             if (copy_from_user(&attr, argp, sizeof(attr)))
> +                     goto out;
> +
> +             r = kvm_vcpu_ioctl_get_device_attr(vcpu, &attr);
> +             break;
> +     }
> +     case KVM_SET_DEVICE_ATTR: {
> +             struct kvm_device_attr attr;
> +
> +             r = -EFAULT;
> +             if (copy_from_user(&attr, argp, sizeof(attr)))
> +                     goto out;
> +
> +             r = kvm_vcpu_ioctl_set_device_attr(vcpu, &attr);
> +             break;

That's a lot of copy paste code, and I omitted a big chunk, too.  What about
something like:

static int kvm_vcpu_ioctl_do_device_attr(struct kvm_vcpu *vcpu,
                                         unsigned int ioctl, void __user *argp)
{
        struct kvm_device_attr attr;
        int r;

        if (copy_from_user(&attr, argp, sizeof(attr)))
                return -EFAULT;

        if (attr->group != KVM_VCPU_TSC_CTRL)
                return -ENXIO;

        switch (ioctl) {
        case KVM_HAS_DEVICE_ATTR:
                r = kvm_arch_tsc_has_attr(vcpu, attr);
                break;
        case KVM_GET_DEVICE_ATTR:
                r = kvm_arch_tsc_get_attr(vcpu, attr);
                break;
        case KVM_SET_DEVICE_ATTR:
                r = kvm_arch_tsc_set_attr(vcpu, attr);
                break;
        default:
                BUG();
        }
        return r;
}

and then:

        case KVM_GET_DEVICE_ATTR:
        case KVM_HAS_DEVICE_ATTR:
        case KVM_SET_DEVICE_ATTR:
                r = kvm_vcpu_ioctl_do_device_attr(vcpu, ioctl, argp);
                break;


or if we want to plan for the future a bit more:

static int kvm_vcpu_ioctl_do_device_attr(struct kvm_vcpu *vcpu,
                                         unsigned int ioctl, void __user *argp)
{
        struct kvm_device_attr attr;
        int r;

        if (copy_from_user(&attr, argp, sizeof(attr)))
                return -EFAULT;

        r = -ENXIO;

        switch (ioctl) {
        case KVM_HAS_DEVICE_ATTR:
                if (attr->group != KVM_VCPU_TSC_CTRL)
                        r = kvm_arch_tsc_has_attr(vcpu, attr);
                break;
        case KVM_GET_DEVICE_ATTR:
                if (attr->group != KVM_VCPU_TSC_CTRL)
                        r = kvm_arch_tsc_get_attr(vcpu, attr);
                break;
        case KVM_SET_DEVICE_ATTR:
                if (attr->group != KVM_VCPU_TSC_CTRL)
                        r = kvm_arch_tsc_set_attr(vcpu, attr);
                break;
        }
        return r;
}

> +     }
>       default:
>               r = -EINVAL;
>       }
> -- 
> 2.32.0.432.gabb21c7263-goog
> 
_______________________________________________
kvmarm mailing list
[email protected]
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm

Reply via email to