[PATCH 4/6] KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2

2021-04-01 Thread Maxim Levitsky
This is a new version of KVM_GET_SREGS / KVM_SET_SREGS ioctls,
aiming to replace them.

It has the following changes:
   * Has flags for future extensions
   * Has vcpu's PDPTS, which allows to save/restore them on migration.
   * Lacks obsolete interrupt bitmap (done now via KVM_SET_VCPU_EVENTS)

New capability, KVM_CAP_SREGS2 is added to signal
userspace of this ioctl.

Currently only implemented on x86.

Signed-off-by: Maxim Levitsky 
---
 Documentation/virt/kvm/api.rst  |  43 ++
 arch/x86/include/asm/kvm_host.h |   7 ++
 arch/x86/include/uapi/asm/kvm.h |  13 +++
 arch/x86/kvm/kvm_cache_regs.h   |   5 ++
 arch/x86/kvm/x86.c  | 136 ++--
 include/uapi/linux/kvm.h|   5 ++
 6 files changed, 185 insertions(+), 24 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 38e327d4b479..b006d5b5f554 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4941,6 +4941,49 @@ see KVM_XEN_VCPU_SET_ATTR above.
 The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used
 with the KVM_XEN_VCPU_GET_ATTR ioctl.
 
+
+4.131 KVM_GET_SREGS2
+--
+
+:Capability: KVM_CAP_SREGS2
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_sregs2 (out)
+:Returns: 0 on success, -1 on error
+
+Reads special registers from the vcpu.
+This ioctl is preferred over KVM_GET_SREGS when available.
+
+::
+
+struct kvm_sregs2 {
+   /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
+   struct kvm_segment cs, ds, es, fs, gs, ss;
+   struct kvm_segment tr, ldt;
+   struct kvm_dtable gdt, idt;
+   __u64 cr0, cr2, cr3, cr4, cr8;
+   __u64 efer;
+   __u64 apic_base;
+   __u64 flags; /* must be zero*/
+   __u64 pdptrs[4];
+   __u64 padding;
+};
+
+
+4.132 KVM_SET_SREGS2
+--
+
+:Capability: KVM_CAP_SREGS2
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_sregs2 (in)
+:Returns: 0 on success, -1 on error
+
+Writes special registers into the vcpu.
+See KVM_GET_SREGS2 for the data structures.
+This ioctl is preferred over the KVM_SET_SREGS when available.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a52f973bdff6..87b680d111f9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -838,6 +838,13 @@ struct kvm_vcpu_arch {
 
/* Protected Guests */
bool guest_state_protected;
+
+   /*
+* Do we need to reload the pdptrs when entering nested state?
+* Set after nested migration if userspace didn't use the
+* newer KVM_SET_SREGS2 ioctl to load pdptrs from the migration state.
+*/
+   bool reload_pdptrs_on_nested_entry;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a3022c8af82..201a85884c81 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -159,6 +159,19 @@ struct kvm_sregs {
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
 };
 
+struct kvm_sregs2 {
+   /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
+   struct kvm_segment cs, ds, es, fs, gs, ss;
+   struct kvm_segment tr, ldt;
+   struct kvm_dtable gdt, idt;
+   __u64 cr0, cr2, cr3, cr4, cr8;
+   __u64 efer;
+   __u64 apic_base;
+   __u64 flags; /* must be zero*/
+   __u64 pdptrs[4];
+   __u64 padding;
+};
+
 /* for KVM_GET_FPU and KVM_SET_FPU */
 struct kvm_fpu {
__u8  fpr[8][16];
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 07d607947805..1a6e2de4248a 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -120,6 +120,11 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, 
int index)
return vcpu->arch.walk_mmu->pdptrs[index];
 }
 
+static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
+{
+   vcpu->arch.walk_mmu->pdptrs[index] = value;
+}
+
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a9d95f90a048..f10a37f88c30 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -112,6 +112,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, 
unsigned long rflags);
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
 
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
@@ -3796,6 +3799,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_X86_USER_SPACE_MSR:
case KVM_CAP_X86_MSR_FILTER:
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:

Re: [PATCH 4/6] KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2

2021-04-01 Thread Paolo Bonzini

Just a quick review on the API:

On 01/04/21 16:18, Maxim Levitsky wrote:

+struct kvm_sregs2 {
+   /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
+   struct kvm_segment cs, ds, es, fs, gs, ss;
+   struct kvm_segment tr, ldt;
+   struct kvm_dtable gdt, idt;
+   __u64 cr0, cr2, cr3, cr4, cr8;
+   __u64 efer;
+   __u64 apic_base;
+   __u64 flags; /* must be zero*/


I think it would make sense to define a flag bit for the PDPTRs, so that 
userspace can use KVM_SET_SREGS2 unconditionally (e.g. even when 
migrating from a source that uses KVM_GET_SREGS and therefore doesn't 
provide the PDPTRs).



+   __u64 pdptrs[4];
+   __u64 padding;


No need to add padding; if we add more fields in the future we can use 
the flags to determine the length of the userspace data, similar to 
KVM_GET/SET_NESTED_STATE.





+   idx = srcu_read_lock(>kvm->srcu);
+   if (is_pae_paging(vcpu)) {
+   for (i = 0 ; i < 4 ; i++)
+   kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+   kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+   mmu_reset_needed = 1;
+   }
+   srcu_read_unlock(>kvm->srcu, idx);
+


SRCU should not be needed here?


+   case KVM_GET_SREGS2: {
+   u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), 
GFP_KERNEL_ACCOUNT);
+   r = -ENOMEM;
+   if (!u.sregs2)
+   goto out;


No need to account, I think it's a little slower and this allocation is 
very short lived.



 #define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_SREGS2 196


195, not 196.


 #define KVM_XEN_VCPU_GET_ATTR  _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
 #define KVM_XEN_VCPU_SET_ATTR  _IOW(KVMIO,  0xcb, struct kvm_xen_vcpu_attr)
+
+#define KVM_GET_SREGS2 _IOR(KVMIO,  0xca, struct kvm_sregs2)
+#define KVM_SET_SREGS2 _IOW(KVMIO,  0xcb, struct kvm_sregs2)
+


It's not exactly overlapping, but please bump the ioctls to 0xcc/0xcd.

Paolo



Re: [PATCH 4/6] KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2

2021-04-01 Thread Maxim Levitsky
On Thu, 2021-04-01 at 16:44 +0200, Paolo Bonzini wrote:
> Just a quick review on the API:
> 
> On 01/04/21 16:18, Maxim Levitsky wrote:
> > +struct kvm_sregs2 {
> > +   /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
> > +   struct kvm_segment cs, ds, es, fs, gs, ss;
> > +   struct kvm_segment tr, ldt;
> > +   struct kvm_dtable gdt, idt;
> > +   __u64 cr0, cr2, cr3, cr4, cr8;
> > +   __u64 efer;
> > +   __u64 apic_base;
> > +   __u64 flags; /* must be zero*/
> 
> I think it would make sense to define a flag bit for the PDPTRs, so that 
> userspace can use KVM_SET_SREGS2 unconditionally (e.g. even when 
> migrating from a source that uses KVM_GET_SREGS and therefore doesn't 
> provide the PDPTRs).
Yes, I didn't think about this case! I'll add this to the next version.
Thanks!

> 
> > +   __u64 pdptrs[4];
> > +   __u64 padding;
> 
> No need to add padding; if we add more fields in the future we can use 
> the flags to determine the length of the userspace data, similar to 
> KVM_GET/SET_NESTED_STATE.
Got it, will fix. I added it just in case.

> 
> 
> > +   idx = srcu_read_lock(>kvm->srcu);
> > +   if (is_pae_paging(vcpu)) {
> > +   for (i = 0 ; i < 4 ; i++)
> > +   kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
> > +   kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
> > +   mmu_reset_needed = 1;
> > +   }
> > +   srcu_read_unlock(>kvm->srcu, idx);
> > +
> 
> SRCU should not be needed here?

I haven't yet studied in depth the locking that is used in the kvm,
so I put this to be on the safe side.

I looked at it a bit and it looks like the pdptr reading code takes
this lock because it accesses the memslots, which is not done here,
and therefore the lock is indeed not needed here.

I need to study in depth how locking is done in kvm to be 100% sure
about this.


> 
> > +   case KVM_GET_SREGS2: {
> > +   u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), 
> > GFP_KERNEL_ACCOUNT);
> > +   r = -ENOMEM;
> > +   if (!u.sregs2)
> > +   goto out;
> 
> No need to account, I think it's a little slower and this allocation is 
> very short lived.
Right, I will fix this in the next version.

> 
> >  #define KVM_CAP_PPC_DAWR1 194
> > +#define KVM_CAP_SREGS2 196
> 
> 195, not 196.

I am also planning to add KVM_CAP_SET_GUEST_DEBUG2 for which I
used 195.
Prior to sending I rebased all of my patch series on top of kvm/queue,
but I kept the numbers just in case.

> 
> >  #define KVM_XEN_VCPU_GET_ATTR  _IOWR(KVMIO, 0xca, struct 
> > kvm_xen_vcpu_attr)
> >  #define KVM_XEN_VCPU_SET_ATTR  _IOW(KVMIO,  0xcb, struct 
> > kvm_xen_vcpu_attr)
> > +
> > +#define KVM_GET_SREGS2 _IOR(KVMIO,  0xca, struct kvm_sregs2)
> > +#define KVM_SET_SREGS2 _IOW(KVMIO,  0xcb, struct kvm_sregs2)
> > +
> 
> It's not exactly overlapping, but please bump the ioctls to 0xcc/0xcd.
Will do.


Thanks a lot for the review!

Best regards,
Maxim Levitsky

> 
> Paolo
> 




Re: [PATCH 4/6] KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2

2021-04-01 Thread Paolo Bonzini

On 01/04/21 19:10, Maxim Levitsky wrote:

I haven't yet studied in depth the locking that is used in the kvm,
so I put this to be on the safe side.

I looked at it a bit and it looks like the pdptr reading code takes
this lock because it accesses the memslots, which is not done here,
and therefore the lock is indeed not needed here.

I need to study in depth how locking is done in kvm to be 100% sure
about this.


Yes, SRCU protects reading the memslots (and therefore accessing guest 
memory).



195, not 196.


I am also planning to add KVM_CAP_SET_GUEST_DEBUG2 for which I
used 195.


Sounds good then.

Paolo