Re: [Qemu-devel] [QEMU PATCH v3 7/9] KVM: i386: Add support for save and restore nested state

2019-06-18 Thread Maran Wilson

On 6/17/2019 10:56 AM, Liran Alon wrote:

Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
introduced new IOCTLs to extract and restore vCPU state related to
Intel VMX & AMD SVM.

Utilize these IOCTLs to add support for migration of VMs which are
running nested hypervisors.

Reviewed-by: Nikita Leshenko 
Signed-off-by: Liran Alon 
---
  accel/kvm/kvm-all.c   |   8 ++
  include/sysemu/kvm.h  |   1 +
  target/i386/cpu.h |   3 +
  target/i386/kvm.c |  80 +
  target/i386/machine.c | 196 ++
  5 files changed, 288 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 59a3aa3a40da..4fdf5b04b131 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -88,6 +88,7 @@ struct KVMState
  #ifdef KVM_CAP_SET_GUEST_DEBUG
  QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
  #endif
+int max_nested_state_len;
  int many_ioeventfds;
  int intx_set_mask;
  bool sync_mmu;
@@ -1678,6 +1679,8 @@ static int kvm_init(MachineState *ms)
  s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
  #endif
  
+s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);

+
  #ifdef KVM_CAP_IRQ_ROUTING
  kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
  #endif
@@ -2245,6 +2248,11 @@ int kvm_has_debugregs(void)
  return kvm_state->debugregs;
  }
  
+int kvm_max_nested_state_length(void)

+{
+return kvm_state->max_nested_state_len;
+}
+
  int kvm_has_many_ioeventfds(void)
  {
  if (!kvm_enabled()) {
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 64f55e519df7..acd90aebb6c4 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void);
  int kvm_has_vcpu_events(void);
  int kvm_has_robust_singlestep(void);
  int kvm_has_debugregs(void);
+int kvm_max_nested_state_length(void);
  int kvm_has_pit_state2(void);
  int kvm_has_many_ioeventfds(void);
  int kvm_has_gsi_routing(void);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 79d9495ceb0c..a6bb71849869 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1350,6 +1350,9 @@ typedef struct CPUX86State {
  #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
  void *xsave_buf;
  #endif
+#if defined(CONFIG_KVM)
+struct kvm_nested_state *nested_state;
+#endif
  #if defined(CONFIG_HVF)
  HVFX86EmulatorState *hvf_emul;
  #endif
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index f43e2d69859e..5950c3ed0d1c 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -931,6 +931,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
  struct kvm_cpuid_entry2 *c;
  uint32_t signature[3];
  int kvm_base = KVM_CPUID_SIGNATURE;
+int max_nested_state_len;
  int r;
  Error *local_err = NULL;
  
@@ -1331,6 +1332,24 @@ int kvm_arch_init_vcpu(CPUState *cs)

  if (has_xsave) {
  env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
  }
+
+max_nested_state_len = kvm_max_nested_state_length();
+if (max_nested_state_len > 0) {
+assert(max_nested_state_len >= offsetof(struct kvm_nested_state, 
data));
+env->nested_state = g_malloc0(max_nested_state_len);
+
+env->nested_state->size = max_nested_state_len;
+
+if (IS_INTEL_CPU(env)) {
+struct kvm_vmx_nested_state_hdr *vmx_hdr =
+&env->nested_state->hdr.vmx;
+
+vmx_hdr->vmxon_pa = -1ull;
+vmx_hdr->vmcs12_pa = -1ull;
+}
+
+}
+
  cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
  
  if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {

@@ -1352,12 +1371,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
  int kvm_arch_destroy_vcpu(CPUState *cs)
  {
  X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
  
  if (cpu->kvm_msr_buf) {

  g_free(cpu->kvm_msr_buf);
  cpu->kvm_msr_buf = NULL;
  }
  
+if (env->nested_state) {

+g_free(env->nested_state);
+env->nested_state = NULL;
+}
+
  return 0;
  }
  
@@ -3072,6 +3097,52 @@ static int kvm_get_debugregs(X86CPU *cpu)

  return 0;
  }
  
+static int kvm_put_nested_state(X86CPU *cpu)

+{
+CPUX86State *env = &cpu->env;
+int max_nested_state_len = kvm_max_nested_state_length();
+
+if (max_nested_state_len <= 0) {
+return 0;
+}
+
+assert(env->nested_state->size <= max_nested_state_len);
+return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
+}
+
+static int kvm_get_nested_state(X86CPU *cpu)
+{
+CPUX86State *env = &cpu->env;
+int max_nested_state_len = kvm_max_nested_state_length();
+int ret;
+
+if (max_nested_state_len <= 0) {
+return 0;
+}
+
+/*
+ * It is possible that migration restored a smaller size into
+ * nested_state->hdr.size than what our kernel support.
+ * We preserve migration origin nested_state->hdr.size for
+ * call to KVM_SET_NESTED_STATE but 

Re: [Qemu-devel] [QEMU PATCH v3 7/9] KVM: i386: Add support for save and restore nested state

2019-06-18 Thread Paolo Bonzini
On 18/06/19 17:48, Dr. David Alan Gilbert wrote:
>> Currently, KVM folks (including myself), haven’t decided yet to expose 
>> vmcs12 struct layout to userspace but instead to still leave it opaque.
>> The formal size of this size is VMCS12_SIZE (defined in kernel as 0x1000). I 
>> was wondering if we wish to expose VMCS12_SIZE constant to userspace or not.
>> So currently I defined these __u8 arrays as 0x1000. But in case Paolo agrees 
>> to expose VMCS12_SIZE, we can use that instead.
> Well if it's not defined it's bound to change at some state!
> Also, do we need to clear it before we get it from the kernel - e.g.
> is the kernel guaranteed to give us 0x1000 ?

It is defined, it is the size of data.vmx[0].vmcs12.  We can define it.

Paolo



Re: [Qemu-devel] [QEMU PATCH v3 7/9] KVM: i386: Add support for save and restore nested state

2019-06-18 Thread Liran Alon



> On 18 Jun 2019, at 18:48, Dr. David Alan Gilbert  wrote:
> 
> * Liran Alon (liran.a...@oracle.com) wrote:
>> 
>>> On 18 Jun 2019, at 12:03, Dr. David Alan Gilbert  
>>> wrote:
>>> 
>>> * Liran Alon (liran.a...@oracle.com) wrote:
 
 +static const VMStateDescription vmstate_vmx_vmcs12 = {
 +  .name = "cpu/kvm_nested_state/vmx/vmcs12",
 +  .version_id = 1,
 +  .minimum_version_id = 1,
 +  .needed = vmx_vmcs12_needed,
 +  .fields = (VMStateField[]) {
 +  VMSTATE_UINT8_ARRAY(data.vmx[0].vmcs12,
 +  struct kvm_nested_state, 0x1000),
>>> 
>>> Where did that magic 0x1000 come from?
>> 
>> Currently, KVM folks (including myself), haven’t decided yet to expose 
>> vmcs12 struct layout to userspace but instead to still leave it opaque.
>> The formal size of this size is VMCS12_SIZE (defined in kernel as 0x1000). I 
>> was wondering if we wish to expose VMCS12_SIZE constant to userspace or not.
>> So currently I defined these __u8 arrays as 0x1000. But in case Paolo agrees 
>> to expose VMCS12_SIZE, we can use that instead.
> 
> Well if it's not defined it's bound to change at some state!

I agree it’s better to expose VMCS12_SIZE to userspace but I didn’t want to be 
the one that decides this.
Let’s let Paolo decide and modify this patch accordingly if he decides to 
expose it.

> Also, do we need to clear it before we get it from the kernel - e.g.
> is the kernel guaranteed to give us 0x1000 ?

Userspace don’t need to clear it before getting it from kernel.
It does guarantee to give you 0x1000. See vmx_get_nested_state() implementation 
in kernel.

-Liran

> 
> Dave
> 
>> -Liran
>> 
>>> --
>>> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
>> 
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [Qemu-devel] [QEMU PATCH v3 7/9] KVM: i386: Add support for save and restore nested state

2019-06-18 Thread Liran Alon


> On 18 Jun 2019, at 12:03, Dr. David Alan Gilbert  wrote:
> 
> * Liran Alon (liran.a...@oracle.com) wrote:
>> 
>> +static const VMStateDescription vmstate_vmx_vmcs12 = {
>> +.name = "cpu/kvm_nested_state/vmx/vmcs12",
>> +.version_id = 1,
>> +.minimum_version_id = 1,
>> +.needed = vmx_vmcs12_needed,
>> +.fields = (VMStateField[]) {
>> +VMSTATE_UINT8_ARRAY(data.vmx[0].vmcs12,
>> +struct kvm_nested_state, 0x1000),
> 
> Where did that magic 0x1000 come from?

Currently, KVM folks (including myself), haven’t decided yet to expose vmcs12 
struct layout to userspace but instead to still leave it opaque.
The formal size of this size is VMCS12_SIZE (defined in kernel as 0x1000). I 
was wondering if we wish to expose VMCS12_SIZE constant to userspace or not.
So currently I defined these __u8 arrays as 0x1000. But in case Paolo agrees to 
expose VMCS12_SIZE, we can use that instead.

-Liran

> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [Qemu-devel] [QEMU PATCH v3 7/9] KVM: i386: Add support for save and restore nested state

2019-06-18 Thread Dr. David Alan Gilbert
* Liran Alon (liran.a...@oracle.com) wrote:
> 
> > On 18 Jun 2019, at 12:03, Dr. David Alan Gilbert  
> > wrote:
> > 
> > * Liran Alon (liran.a...@oracle.com) wrote:
> >> 
> >> +static const VMStateDescription vmstate_vmx_vmcs12 = {
> >> +  .name = "cpu/kvm_nested_state/vmx/vmcs12",
> >> +  .version_id = 1,
> >> +  .minimum_version_id = 1,
> >> +  .needed = vmx_vmcs12_needed,
> >> +  .fields = (VMStateField[]) {
> >> +  VMSTATE_UINT8_ARRAY(data.vmx[0].vmcs12,
> >> +  struct kvm_nested_state, 0x1000),
> > 
> > Where did that magic 0x1000 come from?
> 
> Currently, KVM folks (including myself), haven’t decided yet to expose vmcs12 
> struct layout to userspace but instead to still leave it opaque.
> The formal size of this size is VMCS12_SIZE (defined in kernel as 0x1000). I 
> was wondering if we wish to expose VMCS12_SIZE constant to userspace or not.
> So currently I defined these __u8 arrays as 0x1000. But in case Paolo agrees 
> to expose VMCS12_SIZE, we can use that instead.

Well if it's not defined it's bound to change at some state!
Also, do we need to clear it before we get it from the kernel - e.g.
is the kernel guaranteed to give us 0x1000 ?

Dave

> -Liran
> 
> > --
> > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



Re: [Qemu-devel] [QEMU PATCH v3 7/9] KVM: i386: Add support for save and restore nested state

2019-06-18 Thread Dr. David Alan Gilbert
* Liran Alon (liran.a...@oracle.com) wrote:
> Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
> introduced new IOCTLs to extract and restore vCPU state related to
> Intel VMX & AMD SVM.
> 
> Utilize these IOCTLs to add support for migration of VMs which are
> running nested hypervisors.
> 
> Reviewed-by: Nikita Leshenko 
> Signed-off-by: Liran Alon 
> ---
>  accel/kvm/kvm-all.c   |   8 ++
>  include/sysemu/kvm.h  |   1 +
>  target/i386/cpu.h |   3 +
>  target/i386/kvm.c |  80 +
>  target/i386/machine.c | 196 ++
>  5 files changed, 288 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 59a3aa3a40da..4fdf5b04b131 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -88,6 +88,7 @@ struct KVMState
>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>  QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
>  #endif
> +int max_nested_state_len;
>  int many_ioeventfds;
>  int intx_set_mask;
>  bool sync_mmu;
> @@ -1678,6 +1679,8 @@ static int kvm_init(MachineState *ms)
>  s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
>  #endif
>  
> +s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
> +
>  #ifdef KVM_CAP_IRQ_ROUTING
>  kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 
> 0);
>  #endif
> @@ -2245,6 +2248,11 @@ int kvm_has_debugregs(void)
>  return kvm_state->debugregs;
>  }
>  
> +int kvm_max_nested_state_length(void)
> +{
> +return kvm_state->max_nested_state_len;
> +}
> +
>  int kvm_has_many_ioeventfds(void)
>  {
>  if (!kvm_enabled()) {
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 64f55e519df7..acd90aebb6c4 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void);
>  int kvm_has_vcpu_events(void);
>  int kvm_has_robust_singlestep(void);
>  int kvm_has_debugregs(void);
> +int kvm_max_nested_state_length(void);
>  int kvm_has_pit_state2(void);
>  int kvm_has_many_ioeventfds(void);
>  int kvm_has_gsi_routing(void);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 79d9495ceb0c..a6bb71849869 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1350,6 +1350,9 @@ typedef struct CPUX86State {
>  #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
>  void *xsave_buf;
>  #endif
> +#if defined(CONFIG_KVM)
> +struct kvm_nested_state *nested_state;
> +#endif
>  #if defined(CONFIG_HVF)
>  HVFX86EmulatorState *hvf_emul;
>  #endif
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index f43e2d69859e..5950c3ed0d1c 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -931,6 +931,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  struct kvm_cpuid_entry2 *c;
>  uint32_t signature[3];
>  int kvm_base = KVM_CPUID_SIGNATURE;
> +int max_nested_state_len;
>  int r;
>  Error *local_err = NULL;
>  
> @@ -1331,6 +1332,24 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  if (has_xsave) {
>  env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>  }
> +
> +max_nested_state_len = kvm_max_nested_state_length();
> +if (max_nested_state_len > 0) {
> +assert(max_nested_state_len >= offsetof(struct kvm_nested_state, 
> data));
> +env->nested_state = g_malloc0(max_nested_state_len);
> +
> +env->nested_state->size = max_nested_state_len;
> +
> +if (IS_INTEL_CPU(env)) {
> +struct kvm_vmx_nested_state_hdr *vmx_hdr =
> +&env->nested_state->hdr.vmx;
> +
> +vmx_hdr->vmxon_pa = -1ull;
> +vmx_hdr->vmcs12_pa = -1ull;
> +}
> +
> +}
> +
>  cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
>  
>  if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
> @@ -1352,12 +1371,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  int kvm_arch_destroy_vcpu(CPUState *cs)
>  {
>  X86CPU *cpu = X86_CPU(cs);
> +CPUX86State *env = &cpu->env;
>  
>  if (cpu->kvm_msr_buf) {
>  g_free(cpu->kvm_msr_buf);
>  cpu->kvm_msr_buf = NULL;
>  }
>  
> +if (env->nested_state) {
> +g_free(env->nested_state);
> +env->nested_state = NULL;
> +}
> +
>  return 0;
>  }
>  
> @@ -3072,6 +3097,52 @@ static int kvm_get_debugregs(X86CPU *cpu)
>  return 0;
>  }
>  
> +static int kvm_put_nested_state(X86CPU *cpu)
> +{
> +CPUX86State *env = &cpu->env;
> +int max_nested_state_len = kvm_max_nested_state_length();
> +
> +if (max_nested_state_len <= 0) {
> +return 0;
> +}
> +
> +assert(env->nested_state->size <= max_nested_state_len);
> +return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
> +}
> +
> +static int kvm_get_nested_state(X86CPU *cpu)
> +{
> +CPUX86State *env = &cpu->env;
> +int max_nested_state_len = kvm_max_nested_state_length();
> +int ret;
> +
> +if (max_nested_state_len <= 0) {
> +return 0;
>